library(pipeML)
library(caret)
#> Loading required package: ggplot2
#> Loading required package: lattice
library(dplyr)
#>
#> Attaching package: 'dplyr'
#> The following objects are masked from 'package:stats':
#>
#> filter, lag
#> The following objects are masked from 'package:base':
#>
#> intersect, setdiff, setequal, union
This tutorial demonstrates how to use the pipeML package for training and testing machine learning models. It introduces the main functions of the pipeline and guides you through additional functions for visualization.
Load data
raw.counts = pipeML::raw.counts
traitData = pipeML::traitData
deconvolution = pipeML::deconvolution
Feature selection
Apply repeated feature selection using the Boruta algorithm:
deconvolution$target = as.factor(traitData$Best.Confirmed.Overall.Response)
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = FALSE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
#> Running 5 iterations of the Boruta algorithm
#>
#> Keeping only features confirmed in more than 0.8 of the times for training...............................................................
#> If you want to consider also tentative features, please specify tentative = TRUE in the parameters.
Inspect the results:
head(res_boruta$Matrix_Importance)
#> NULL
cat("Confirmed features:\n", res_boruta$Confirmed)
#> Confirmed features:
cat("Tentative features:\n", res_boruta$Tentative)
#> Tentative features:
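If you want to continue with only the confirmed features, you can subset the feature matrix before training. A minimal sketch, assuming res_boruta$Confirmed is a character vector of column names (it may be empty, as in the run above):
confirmed <- res_boruta$Confirmed
if (length(confirmed) > 0) {
  deconvolution_sel <- deconvolution[, c(confirmed, "target"), drop = FALSE]
}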
To further assess tentative features, set fix = TRUE to rerun the selection and confirm or reject them:
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 5,
fix = TRUE,
doParallel = FALSE,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
For faster execution, enable parallelization (make sure the doParallel and workers parameters are set):
res_boruta <- feature.selection.boruta(
data = deconvolution,
iterations = 10,
fix = FALSE,
doParallel = TRUE,
workers = 2,
threshold = 0.8,
file_name = "Test",
return = FALSE
)
Train Machine Learning models
Train and tune models using repeated stratified k-fold cross-validation:
deconvolution = pipeML::deconvolution
traitData = pipeML::traitData
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 2,
n_rep = 2,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Warning: glm.fit: algorithm did not converge
#> Best ML model found: RF
#> Returning model trained
Access the best-trained model:
res[[1]]$Model
View all trained and tuned machine learning models:
res[[1]]$ML_Models
knitr::include_graphics("figures/Training.png")

Figure 1. Models training performance.
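If the elements of res[[1]]$ML_Models are caret train objects (an assumption; check class(res[[1]]$ML_Models[[1]]) in your session), their resampling results can also be compared directly with caret:
rs <- caret::resamples(res[[1]]$ML_Models)
summary(rs)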
To apply model stacking, set stack = TRUE:
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 2,
n_rep = 2,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
#> Warning: glm.fit: algorithm did not converge
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, SVM_radial, XGboost
#>
#> Meta-learners ML model based on GLM
Inspect the base models used in stacking:
res$Model$Base_models
#> NULL
Access the meta-learner:
res$Model$Meta_learner
Model prediction
After training, users can predict on new data using the compute_prediction() function. You can specify which metric to maximize when determining the optimal classification threshold. Supported values for maximize include: “Accuracy”, “Precision”, “Recall”, “Specificity”, “Sensitivity”, “F1”, and “MCC”.
## testing_set: a held-out feature matrix (not created in this tutorial)
pred = pipeML::compute_prediction(model = res,
test_data = testing_set,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
file.name = "Test",
maximize = "Accuracy",
return = F)
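The returned object can be inspected before plotting; the element names below are the ones used in the get_curves() call that follows (verify them against your pipeML version):
head(pred$Metrics)
pred$AUC$AUROC
pred$AUC$AUPRC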
If return = TRUE, compute_prediction() will save the ROC and PR curves in the Results/ directory. They can also be retrieved with the following function:
get_curves(pred$Metrics, spec = "Specificity",
sens = "Sensitivity", reca = "Recall",
prec = "Precision", color = "Cohort",
auc_roc = pred$AUC$AUROC,
auc_prc = pred$AUC$AUPRC,
file.name = "Test")
To enable parallelization, specify the number of cores via the ncores parameter (the default is detectCores() - 1).
res <- compute_features.training.ML(features_train = deconvolution,
target_var = traitData$Best.Confirmed.Overall.Response,
trait.positive = "CR",
metric = "AUROC",
stack = TRUE,
k_folds = 3,
n_rep = 3,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
ncores = 2,
return = FALSE)
Train and predict using the machine learning models
If you already have a separate testing dataset, you can directly train the model and perform predictions using the following function. You can also choose to optimize a specific metric to select a decision threshold and return detailed prediction performance metrics such as Accuracy, Sensitivity, Specificity, F1 score, MCC, Recall, and Precision.
deconvolution = pipeML::deconvolution
data_train = deconvolution[1:100,]
data_test = deconvolution[!rownames(deconvolution) %in% rownames(data_train),]
res <- compute_features.ML(features_train = data_train,
features_test = data_test,
clinical = traitData,
trait = "Best.Confirmed.Overall.Response",
trait.positive = "CR",
metric = "AUROC",
stack = FALSE,
k_folds = 3,
n_rep = 3,
LODO = FALSE,
batch_id = NULL,
file_name = "Test",
maximize = "F1",
return = FALSE)
Access the prediction metrics:
head(res$Prediction_metrics)
To compute variable importance:
importance = compute_variable.importance(res$Model,
stacking = FALSE,
n_cores = 2)
And to visualize SHAP values:
plot_shap_values(importance, "glm", "shap_plot")
knitr::include_graphics("figures/var_importance.png")

Figure 2. Variable importance.
Leaving one dataset out (LODO) analysis
If your data includes features from multiple cohorts, pipeML provides a flexible approach to perform Leave-One-Dataset-Out (LODO) analysis. This is achieved by applying k-fold stratified sampling across batches, ensuring that each fold preserves the batch structure while maintaining class balance. For this, set LODO = TRUE and provide the vector containing the batch information via the batch_id argument (in the same order as your samples).
Below, we demonstrate how to perform a LODO analysis using simulated datasets:
Simulate datasets from different batches
set.seed(123)
# Simulate traitData with 3 cohorts
traitData <- data.frame(
Sample = paste0("Sample", 1:90),
Response = sample(c("R", "NR"), 90, replace = TRUE),
Cohort = rep(paste0("Cohort", 1:3), each = 30),
stringsAsFactors = FALSE
)
rownames(traitData) <- traitData$Sample
# Simulate counts matrix
counts_all <- matrix(rnorm(1000 * 90), nrow = 1000, ncol = 90)
rownames(counts_all) <- paste0("Gene", 1:1000)
colnames(counts_all) <- traitData$Sample
# Simulate some example features
features_all <- matrix(runif(90 * 15), nrow = 90, ncol = 15)
rownames(features_all) <- traitData$Sample
colnames(features_all) <- paste0("Feature", 1:15)
Perform LODO analysis
prediction = list()
i = 1
for (cohort in unique(traitData$Cohort)) {
  # Test cohort
  traitData_test = traitData %>%
    filter(Cohort == cohort)
  counts_test = counts_all[, colnames(counts_all) %in% rownames(traitData_test)]
  features_test = features_all[rownames(features_all) %in% rownames(traitData_test), ]
  # Train cohort
  traitData_train = traitData %>%
    filter(Cohort != cohort)
  counts_train = counts_all[, colnames(counts_all) %in% rownames(traitData_train)]
  features_train = features_all[rownames(features_all) %in% rownames(traitData_train), ]
  #### ML Training
  res = compute_features.training.ML(features_train,
                                     traitData_train$Response,
                                     trait.positive = "R",
                                     metric = "AUROC",
                                     stack = TRUE,
                                     k_folds = 2,
                                     n_rep = 3,
                                     LODO = TRUE,
                                     batch_id = traitData_train$Cohort,
                                     ncores = 2,
                                     return = FALSE)
  ## Feature importance
  importance = compute_variable.importance(res, stacking = TRUE, n_cores = 2)
  #### ML predicting / testing
  pred = compute_prediction(model = res,
                            test_data = features_test,
                            target_var = traitData_test$Response,
                            trait.positive = "R",
                            stack = TRUE,
                            return = FALSE)
  pred[["Importance"]] = importance
  #### Save results
  prediction[[i]] = pred
  names(prediction)[i] = cohort
  i = i + 1
}
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: BAG, KNN, RIDGE
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in compute_shap_values(ml_models[[base_models[i]]], train_data, :
#> Trivial predictions were found in some resamples. SHAP values cannot be
#> calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.8248835
#> Accuracy: 63.333
#> Sensitivity: 41.176
#> Specificity: 92.308
#> F1 score: 56
#> MCC score: 37.521
#> Recall: 41.176
#> Precision: 87.5
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: C50, GLMNET, SVM_linear
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): Trivial
#> predictions were found in some resamples. SHAP values cannot be calculated
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.04350905
#> Accuracy: 63.333
#> Sensitivity: 100
#> Specificity: 0
#> F1 score: 77.551
#> MCC score: 0
#> Recall: 100
#> Precision: 63.333
#> Choosing base models for stacking.......................................
#>
#> Models chosen are: GLMNET, SVM_radial, XGboost
#>
#> Meta-learners ML model based on GLM
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Computing SHAP values for feature importance...............................................................
#> Warning in e$fun(obj, substitute(ex), parent.frame(), e$data): already
#> exporting variable(s): trivial_predictions_found
#> Predicting target variable using provided ML model.................................................
#> Choosing the threshold that maximizes Accuracy for calculating the confusion matrix...................................................
#> Best threshold: 0.7855284
#> Accuracy: 56.667
#> Sensitivity: 75
#> Specificity: 35.714
#> F1 score: 64.865
#> MCC score: 11.664
#> Recall: 75
#> Precision: 57.143
For plotting, we make use of our get_curves function, here adapted to handle multiple cohorts.
metrics = list()
auroc = c()
auprc = c()
for (i in seq_along(prediction)) {
  metrics[[i]] = prediction[[i]]$Metrics %>%
    dplyr::mutate(Cohort = names(prediction)[i],
                  AUROC = rep(prediction[[i]]$AUC$AUROC, nrow(prediction[[i]]$Metrics)),
                  AUPRC = rep(prediction[[i]]$AUC$AUPRC, nrow(prediction[[i]]$Metrics))) %>%
    tibble::rownames_to_column("Samples")
}
metrics = do.call(rbind, metrics)
metrics$Cohort = as.factor(metrics$Cohort)
get_curves(metrics, "Specificity", "Sensitivity",
"Recall", "Precision", "Cohort",
metrics$AUROC, metrics$AUPRC, "Test")
#> agg_png
#> 2
knitr::include_graphics("figures/ROC.png")

Figure 3. ROC curves.
knitr::include_graphics("figures/PR.png")

Figure 4. PR curves.
Custom k-fold cross validation
In certain use cases it is essential to carefully design how the training and test splits are created to prevent data leakage. pipeML supports the use of custom fold construction functions via the fold_construction_fun argument of the compute_features.training.ML() function. This allows you to preprocess your features within each fold, ensuring that information from the test set is never used during training.
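Such a function receives the training data and a list of fold indices and must return a specific structure. Here is a minimal identity sketch of that contract, inferred from the worked TF example later in this section (element names follow that example; verify against your pipeML version):
identity_folds <- function(data, folds) {
  processed_folds <- lapply(seq_along(folds), function(i) {
    train_idx <- folds[[i]]
    test_idx <- setdiff(seq_len(nrow(data)), train_idx)
    test_data <- data[test_idx, , drop = FALSE]
    obs_test <- test_data$target
    test_data$target <- NULL
    list(train_data = data[train_idx, , drop = FALSE], # features + target for this fold
         test_data = test_data,                        # features only
         obs_test = obs_test,                          # held-out labels
         rowIndex = test_idx,
         fold_name = names(folds)[i])
  })
  # Full training set with target, plus the raw feature matrix
  features_only <- data[, setdiff(colnames(data), "target"), drop = FALSE]
  list(processed_folds, data, features_only)
}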
Why use custom folds?
By default, cross-validation splits are performed directly on the feature matrix. However, if your features are generated through a process that depends on the full set of samples (e.g., clustering, dimensionality reduction, aggregation, or gene set enrichment), applying that process to the whole dataset before splitting can leak test-set information into the training set and overfit the models.
Using a custom fold construction function ensures that:
- Test samples are projected onto the learned space without influencing it.
- Fold-level processing mimics how the model would behave on truly unseen data (see the scaling sketch below).
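As a self-contained illustration of the pitfall, the snippet below uses feature scaling as a stand-in for any sample-dependent transformation (a toy example, not a pipeML function):
set.seed(1)
X <- matrix(rnorm(100 * 5), nrow = 100, ncol = 5)
train_idx <- 1:80
test_idx <- 81:100
# Leaky: scaling parameters are estimated on ALL samples, test set included
X_leaky <- scale(X)
# Leak-free: estimate center/scale on the training rows only, then project the test rows
ctr <- colMeans(X[train_idx, ])
sdv <- apply(X[train_idx, ], 2, sd)
X_test_scaled <- scale(X[test_idx, ], center = ctr, scale = sdv)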
Below you can find an example of a custom fold function created to compute TF activity per sample using the viper algorithm. This approach is based on a TF-target set enrichment analysis similar to GSEA, so the scores are sample-dependent. It needs normalized counts as input.
# These parameters are taken care of automatically inside pipeML once `fold_construction_fun` is set. Be sure not to change the parameter names.
#' @param data Data from which the custom function will be performed
#' @param folds A list of integer vectors indicating row indices for the training set in each fold.
#'
prepare_TFs_folds <- function(data, folds) {
  processed_folds <- list()
  for (i in seq_along(folds)) {
    train_idx <- folds[[i]]
    test_idx <- setdiff(seq_len(nrow(data)), train_idx)
    ## Subset data
    train_data <- data[train_idx, , drop = FALSE]
    obs_train <- train_data$target
    train_data$target <- NULL
    ## Run TFs once on the training fold (user can change this part)
    train_processed <- compute_TF_activity(RNA_tpm = t(train_data))
    colnames(train_processed) <- gsub("[- +]", ".", colnames(train_processed)) ## Replace special characters (ignore in other cases)
    train_cell_data <- train_processed %>%
      dplyr::mutate(target = obs_train)
    ## Prepare test data using trained info (user can change this part)
    obs_test <- data$target[test_idx]
    test_data <- data[test_idx, , drop = FALSE]
    test_data$target <- NULL
    test_data <- compute_TF_activity(RNA_tpm = t(test_data))
    colnames(test_data) <- gsub("[- +]", ".", colnames(test_data)) ## Replace special characters (ignore in other cases)
    processed_folds[[i]] <- list(
      train_data = train_cell_data,
      test_data = test_data,
      obs_test = obs_test,
      rowIndex = test_idx,
      fold_name = names(folds)[i]
    )
  }
  # Run TFs on the full training set (user can change this part)
  obs_train = data$target
  data$target = NULL
  train_processed_final <- compute_TF_activity(RNA_tpm = t(data))
  colnames(train_processed_final) <- gsub("[- +]", ".", colnames(train_processed_final)) ## Replace special characters (ignore in other cases)
  # Get features
  train_cell_data_final <- train_processed_final %>%
    dplyr::mutate(target = obs_train)
  custom_output = train_processed_final
  return(list(processed_folds, train_cell_data_final, custom_output))
}
To create your own custom fold function, you can follow the same structure defined above and change the highlighted parts of the code. Once the function is ready, provide it to the main compute_features.training.ML() function via the fold_construction_fun argument.
## counts_train: Normalized counts to compute the features from the training samples
## traitData_train: Target variable from the training samples
res = pipeML::compute_features.training.ML(as.data.frame(t(counts_train)),
traitData_train$Response,
trait.positive = "R",
metric = "AUROC",
stack = F, k_folds = 5,
n_rep = 10, LODO = F,
ncores = 2, return = F,
fold_construction_fun = prepare_TFs_folds)
To predict, the user needs to calculate the same type of features on the test set:
## counts_test: Normalized counts to compute the features from the testing samples
tf_activities_test <- compute_TF_activity(RNA_tpm = counts_test)
# Predict in TFs
pred = compute_prediction(res, tf_activities_test, traitData_test$Response, trait.positive = "R",
stack = FALSE, maximize = "Accuracy", return = F)
What if fold_construction_fun has multiple parameters?
Sometimes the feature construction function requires additional arguments. For example, compute_TF_activity() may need the TF collection, a minimum target size, and whether TFs should be pruned. These can be exposed in your custom function:
tfs = compute_TF_activity(RNA_tpm = counts.norm, TF.collection = "CollecTRI", min_targets_size = 5, tfs.pruned = FALSE)
The custom function then needs to expose these parameters and pass them through to the feature construction calls. It will look like this:
prepare_TFs_folds <- function(data, folds, TF.collection = "CollecTRI", min_targets_size = 5, tfs.pruned = FALSE) {
  processed_folds <- list()
  for (i in seq_along(folds)) {
    train_idx <- folds[[i]]
    test_idx <- setdiff(seq_len(nrow(data)), train_idx)
    ## Subset data
    train_data <- data[train_idx, , drop = FALSE]
    obs_train <- train_data$target
    train_data$target <- NULL
    ## User can change this part; note that the exposed arguments are passed through
    train_processed <- compute_TF_activity(RNA_tpm = t(train_data),
                                           TF.collection = TF.collection,
                                           min_targets_size = min_targets_size,
                                           tfs.pruned = tfs.pruned)
    colnames(train_processed) <- gsub("[- +]", ".", colnames(train_processed)) ## Replace special characters (ignore in other cases)
    train_cell_data <- train_processed %>%
      dplyr::mutate(target = obs_train)
    ## Prepare test data using trained info
    obs_test <- data$target[test_idx]
    test_data <- data[test_idx, , drop = FALSE]
    test_data$target <- NULL
    ## User can change this part
    test_data <- compute_TF_activity(RNA_tpm = t(test_data),
                                     TF.collection = TF.collection,
                                     min_targets_size = min_targets_size,
                                     tfs.pruned = tfs.pruned)
    colnames(test_data) <- gsub("[- +]", ".", colnames(test_data)) ## Replace special characters (ignore in other cases)
    processed_folds[[i]] <- list(
      train_data = train_cell_data,
      test_data = test_data,
      obs_test = obs_test,
      rowIndex = test_idx,
      fold_name = names(folds)[i]
    )
  }
  # Run TFs on the full training set (user can change this part)
  obs_train = data$target
  data$target = NULL
  train_processed_final <- compute_TF_activity(RNA_tpm = t(data),
                                               TF.collection = TF.collection,
                                               min_targets_size = min_targets_size,
                                               tfs.pruned = tfs.pruned)
  colnames(train_processed_final) <- gsub("[- +]", ".", colnames(train_processed_final)) ## Replace special characters (ignore in other cases)
  # Get features
  train_cell_data_final <- train_processed_final %>%
    dplyr::mutate(target = obs_train)
  custom_output = train_processed_final
  return(list(processed_folds, train_cell_data_final, custom_output))
}
When calling compute_features.training.ML(), these fixed arguments are passed via fold_construction_args_fixed:
res = pipeML::compute_features.training.ML(as.data.frame(t(counts_train)),
traitData_train$Response,
trait.positive = "R",
metric = "AUROC",
stack = F, k_folds = 5,
n_rep = 10, LODO = F,
ncores = 2, return = F,
fold_construction_fun = prepare_TFs_folds,
fold_construction_args_fixed = list(TF.collection = "CollecTRI",
min_targets_size = 5,
tfs.pruned = FALSE))
Tunable hyperparameters during custom function
There will be cases where a parameter of the feature construction step can take different values and thus affect performance during training. pipeML implements custom hyperparameter tuning for such cases, where the goal is to find which value of a given parameter maximizes model performance. Briefly, it takes a grid of values proposed by the user and computes custom k-folds for each combination of values, testing the performance across different ML models and their corresponding hyperparameters. It then returns the optimized parameters that yield the best performance across a user-defined number of iterations. For this, define the parameter fold_construction_args_tunable with the different values that should be tested. Here we want to test the performance using 5, 10, 15, and 20 as candidate values for min_targets_size.
res = pipeML::compute_features.training.ML(as.data.frame(t(counts_train)),
traitData_train$Response,
trait.positive = "R",
metric = "AUROC",
stack = F, k_folds = 5,
n_rep = 10, LODO = F,
ncores = 2, return = F,
fold_construction_fun = prepare_TFs_folds,
fold_construction_args_fixed = list(TF.collection = "CollecTRI", tfs.pruned = FALSE),
fold_construction_args_tunable = list(min_targets_size = c(5, 10, 15, 20)))
For this to work, your custom function must accept a bestune argument, which is used internally to inject the optimized parameter values found during the tuning step.
When bestune is NULL, the function assumes that tuning has not yet been performed. A grid of candidate parameter values (defined in fold_construction_args_tunable) is generated. For each fold, the function iterates through all combinations of parameter values and recomputes the features. This exploration step is parallelized across folds using foreach and doParallel, allowing multiple folds to be processed simultaneously. Parallelization reduces runtime considerably when the parameter grid or number of folds is large.
When bestune is not NULL, the tuning process has already been completed. The optimized parameter values are extracted from the bestune object, and the features are recomputed once on the full training dataset using these tuned parameters. This ensures the final model is trained with the best parameter setting identified during cross-validation.
In summary, the bestune argument acts as a control switch:
- NULL → run the parameter search with parallelized fold-level evaluation.
- non-NULL → lock in the tuned parameter values and rebuild the features for final training.
This design allows a single custom fold-construction function to handle both hyperparameter tuning (exploration, parallelized) and final model preparation (exploitation, a single optimized run).
prepare_TFs_folds <- function(data, folds, TF.collection = "CollecTRI", min_targets_size = 5,
                              tfs.pruned = FALSE, bestune = NULL, ncores = NULL) {
  if (!is.null(bestune)) {
    obs_train = data$target
    data$target = NULL
    best_params <- bestune %>%
      dplyr::select(min_targets_size) ### Here you should put the hyperparameters that you want to tune
    ## Recompute the features once on the full training set with the tuned value
    train_processed_final <- compute_TF_activity(RNA_tpm = t(data),
                                                 TF.collection = TF.collection,
                                                 min_targets_size = best_params$min_targets_size, ### Here you replace by the value of the optimized parameter
                                                 tfs.pruned = tfs.pruned)
    colnames(train_processed_final) <- gsub("[- +]", ".", colnames(train_processed_final)) ## Replace special characters (ignore in other cases)
    # Get cell group features
    train_cell_data_final <- train_processed_final %>%
      dplyr::mutate(target = obs_train)
    custom_output = train_processed_final
    res = list(train_cell_data_final, custom_output, best_params)
  } else { ### Means the best tune still needs to be found
    #### Here put all the parameters you want to tune
    custom_grid <- expand.grid(
      min_targets_size = min_targets_size
    )
    ### Implement parallelization
    if (is.null(ncores)) {
      ncores = parallel::detectCores() - 2
    }
    cl <- parallel::makeCluster(ncores)
    doParallel::registerDoParallel(cl)
    `%dopar%` <- foreach::`%dopar%` # make %dopar% available without attaching foreach
    # Parallelize over folds
    processed_folds <- foreach::foreach(i = seq_along(folds), .packages = c("dplyr")) %dopar% {
      cat("Starting fold", names(folds)[i], "\n")
      train_idx <- folds[[i]]
      test_idx <- setdiff(seq_len(nrow(data)), train_idx)
      ## Subset data
      train_data <- data[train_idx, , drop = FALSE]
      obs_train <- train_data$target
      train_data$target <- NULL
      fold_results <- vector("list", nrow(custom_grid))
      for (j in seq_len(nrow(custom_grid))) {
        ## User can change this part
        train_processed <- compute_TF_activity(RNA_tpm = t(train_data),
                                               TF.collection = TF.collection,
                                               min_targets_size = custom_grid$min_targets_size[j], ### Iterates across parameter values
                                               tfs.pruned = tfs.pruned)
        colnames(train_processed) <- gsub("[- +]", ".", colnames(train_processed)) ## Replace special characters (ignore in other cases)
        train_cell_data <- train_processed %>%
          dplyr::mutate(target = obs_train)
        ## Prepare test data using trained info
        obs_test <- data$target[test_idx]
        test_data <- data[test_idx, , drop = FALSE]
        test_data$target <- NULL
        ## User can change this part
        test_data <- compute_TF_activity(RNA_tpm = t(test_data),
                                         TF.collection = TF.collection,
                                         min_targets_size = custom_grid$min_targets_size[j], ### Iterates across parameter values
                                         tfs.pruned = tfs.pruned)
        colnames(test_data) <- gsub("[- +]", ".", colnames(test_data)) ## Replace special characters (ignore in other cases)
        fold_results[[j]] <- list(
          train_data = train_cell_data,
          test_data = test_data,
          obs_test = obs_test,
          rowIndex = test_idx,
          fold_name = names(folds)[i],
          params = custom_grid[j, ]
        )
      }
      fold_results
    }
    parallel::stopCluster(cl) # stop the cluster after parallel execution
    foreach::registerDoSEQ() # reset the foreach backend so a stopped cluster is not reused
    res <- processed_folds
  }
  return(res)
}