2  Methods

2.1 Reproducibility

I list below the R packages required to reproduce the analyses.

Code
## data wrangling
library(dplyr)
library(tidyr)
library(stringr)
library(readr)
library(readxl)
library(purrr)

## plotting
library(ggplot2)

## reporting
library(flextable)

# automated package linting
library(xml2)
library(downlit)

## auxiliary functions
source("R/preprocessing.R")
source("R/differential_barcoding.R")
source("R/convert_bioc_objects.R")
today_date <- "2025-07-01"
# today_date <- format(Sys.Date(), "%Y-%m-%d")

2.2 Export to curated formats

  • Read all barcode profiles and merge them on column barcode_id.
Listing 2.1: Import all raw barcode .tsv files, and add artificial controls for experiment exp200921_dose_response.
Code
pheno_data <- readr::read_csv(paste0("./data/replicates_barcode_metadata_",
                                     today_date, ".csv"), 
                              show_col_types = FALSE) 
barcode_files <- unique(pheno_data$Filename)

## 1) artificially add duplicated metadata ----
artificial_controls_metadata <- pheno_data |>
  dplyr::filter(Compound == "Control" & Batch_ID == "exp200921") |>
  dplyr::mutate(Batch_ID = "exp200921_dose_response", 
                Replicates_ID = gsub("run111021", "run101121", Replicates_ID))
pheno_data <- pheno_data |>
  dplyr::bind_rows(artificial_controls_metadata) |> 
  # add a unique column per compound and biological condition
  dplyr::mutate(Unique_compound = paste0(Compound, "_", Batch_ID, "_",
                                        "Conc: ", Concentrations_Formatted, "_",
                                        "Days: ", Duration, sep = "_"))

barcode_counts_aggregated <- purrr::map(barcode_files, function(filename) {
  barcode_filepath <- file.path("./data/barcode-counts", filename)
  barcode_counts <- readr::read_tsv(barcode_filepath, 
                                    show_col_types = FALSE)
  
  ## 2) artificially add duplicated control samples for dose-response ----
  if (filename == "exp200921.tsv") {
    barcode_duplicated_controls <- barcode_counts |> 
      dplyr::select("barcode_id", dplyr::matches("^Contro(.*)_000u_exp200921_run111021_")) |> 
      dplyr::rename_with(
        .fn = ~ gsub("run111021", "run101121", .x),
        .cols = starts_with("Contro"))
    
    barcode_counts <- barcode_counts |> 
      dplyr::inner_join(barcode_duplicated_controls, by = "barcode_id")
  }
  
  return(barcode_counts)
}) |> 
  purrr::reduce(~ inner_join(.x, .y, by = "barcode_id"))

## 3) check that we can map each barcode replicate to its corresponding metadata information
stopifnot(identical(setdiff(pheno_data$Replicates_ID, colnames(barcode_counts_aggregated)), character(0)))
gc()
##             used  (Mb) gc trigger   (Mb)  max used   (Mb)
##  Ncells  1294189  69.2    6388343  341.2   5144996  274.8
##  Vcells 27183137 207.4  505644887 3857.8 632024133 4822.0

The resulting barcode matrix stores 37689 unique barcode IDs, and 598 unique replicates (with 4 artificially duplicated controls for dose response, Date: 20-09-2021).

2.2.1 DGEList object

  • Export to EdgeR::DGEList format.
Listing 2.2: Export to DGEList format.
Code
# prepare barcode counts ----
counts <- barcode_counts_aggregated |> 
  tibble::column_to_rownames("barcode_id") |> 
  as.matrix()
## ensure consistent order
counts <- counts[, pheno_data$Replicates_ID]

rownames(pheno_data) <- pheno_data$Replicates_ID
dge_count <- edgeR::DGEList(counts = counts, 
                            samples = pheno_data, 
                            group = pheno_data$Compound,
                            remove.zeros = TRUE)
saveRDS(dge_count, 
        paste0("./data/processed/DGEList_before_filtering_", today_date, ".rds"))

2.2.2 SummarizedExperiment object

Listing 2.3: Export to SummarizedExperiment format.
Code
SE_object <- DGEList2SE (dge_count)
# check that both expression matrices store the same content
stopifnot(identical(dge_count$counts, SummarizedExperiment::assay(SE_object)))
saveRDS(SE_object, 
        paste0("./data/processed/SummarizedExperiment_before_filtering_", today_date, ".rds"))

rm(artificial_controls_metadata, barcode_counts_aggregated,
   barcode_metadata, counts, samples, pheno_data, barcode_files)

2.3 Luca’s pipeline

Code
old_dge_count <- readRDS(paste0("./data/processed/DGEList_before_filtering_", 
                            today_date, ".rds"))
# remove Zero-controls, P42 and P43 experiences
dge_count <- old_dge_count[, !old_dge_count$samples$Compound %in% c("Control - time zero", "P42", "P43", "Control - T75")]

2.3.1 Preprocessing: noise removal and normalisation

Eliminate barcodes for which the combined counts of the 4 controls per barcode are below a given threshold, here 5.

  • By removing Time-Zero-controls, P42 and P43, as well as T75 flasks, we switch from 598 to 574 unique biological replicates.
Listing 2.4: Remove background noise, and aggregate all individual barcode counts.
Code
rm(old_dge_count)

dge_normalised_replicates <- preprocessing_luca(dgeObject = dge_count, 
                                     group = "Batch_ID",
                                     compound = "Control",
                                     thresh_background = 4,
                                     normalisation_function = cpm_normalisation)

saveRDS(dge_normalised_replicates, 
        file = paste0("./data/processed/dge_normalised_replicate_",
                          today_date, ".rds"))
Listing 2.5: Simply compute the Pearson correlation score at the Drug per Batch level.
Code
dge_normalised_samples <- collapseSamples(dge_normalised_replicates,
                                          group = "Unique_compound",
                                          showReps = TRUE, 
                                          sample_colname = "Replicates_ID", 
                                          method = mean)
saveRDS(dge_normalised_samples,
        file = paste0("./data/processed/dge_normalised_samples_",
                      today_date, ".rds"))

We collapsed from 574 to 142 by averaging barcode counts over biological replicates.

2.4 Differential barcode analyses

  1. Follow differential protocol by Luca (fold change with respect to the mean, then binarise, and assign a 1 if \(FC>3\)).
Listing 2.6: Compute the averaged value for control replicates, then binarise for each compound. A 1 denoting a significant positive enrichment for a given barcode_id.
Code
dge_binarised_replicates <- differential_barcoded_luca(dge_normalised_replicates,
                                                       group = "Batch_ID",
                                                       reference = "Control",
                                                       threshold_FC = 3)
saveRDS(dge_binarised_replicates,
        file = paste0("./results/differential_analyses/dge_discretised_replicate_",
                      today_date, ".rds"))

We removed 60 control samples (all remaining samples being associated with an effective drug).

  1. Aggregate over replicates, using a logical AND (in other words, a barcode must be differential in all replicates of a given compound, to be consider DR):
Code
# Use an all function as aggregator, but could have been a mean or sum as well.

dge_binarised_samples <- collapseSamples(dge_binarised_replicates,
                                         group = "Unique_compound",
                                         showReps = FALSE, 
                                         sample_colname = "Replicates_ID", 
                                         method = all)

null_barcodes <- which(rowSums(dge_binarised_samples$counts) == 0)
  if (length(null_barcodes) > 0L) {
    message(paste("We removed a total of", length(null_barcodes),
                  "over a total of", nrow(dge_binarised_samples), "barcode IDs.
                  None of them being fully differential in at least one compound."))
    dge_binarised_samples <- dge_binarised_samples[-null_barcodes,]
  }

saveRDS(dge_binarised_samples,
        file = paste0("./results/differential_analyses/dge_discretised_samples_",
                      today_date, ".rds"))

We collapsed from 514 to 128 by averaging barcode counts over biological replicates while removing 3.747577 \(\%\) of barcodes, showing no consistent differential expression for at least one compound.