diff --git a/DESCRIPTION b/DESCRIPTION index 5322c59..7264e5c 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -75,7 +75,8 @@ Suggests: scuttle, methods, SeuratObject, - SummarizedExperiment + SummarizedExperiment, + seqgendiff Config/testthat/edition: 3 Imports: dplyr, diff --git a/NAMESPACE b/NAMESPACE index e616818..b7a43c4 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,6 +1,7 @@ # Generated by roxygen2: do not edit by hand export(aggregate_by_de) +export(check_zeroinflation) export(compute_chem_descriptors) export(compute_de_umap) export(compute_hyper_enrich_bg) @@ -34,6 +35,7 @@ export(plot_qc_metrics_heatmap) export(plot_rle) export(plot_volcano) export(read_metadata) +export(subsample_genes) export(summarise_de) export(validate_metadata) import(DESeq2) diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R new file mode 100644 index 0000000..84ab865 --- /dev/null +++ b/R/check_zeroinflation.R @@ -0,0 +1,179 @@ +#' Quick group-aware zero-inflation check (Negative Binomial baseline via edgeR) +#' +#' @description +#' Computes a **sample group-aware Zero-Inflation (ZI) index** for each gene using a +#' negative-binomial (NB) baseline fitted with **edgeR**. For each group +#' (e.g., drug condition), the function: +#' 1) estimates gene-wise tagwise dispersions with edgeR (using all selected groups), +#' 2) builds NB-expected zero probabilities from TMMwsp-scaled means, and +#' 3) returns per-gene ZI (observed zeros minus NB-expected zeros) and +#' per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. +#' +#' This is intended as a **fast screening diagnostic** to decide whether +#' standard NB GLM methods (edgeR/DESeq2) are adequate or whether a +#' zero-aware workflow (e.g., ZINB-WaVE) might be warranted. +#' +#' This function **relies on edgeR** to estimate dispersion. The current +#' implementation requires **≥2 groups** in the design so that edgeR can +#' stabilize gene-wise dispersions across groups. If you only have a single +#' group and still want a design-aware baseline for expected zeros, fit a +#' Gamma–Poisson/NB GLM and compute the +#' expected zero probabilities from its fitted means and over-dispersion. +#' +#' @param data Seurat object. +#' @param group_by Character, column in `data@meta.data` that defines groups +#' (default: `"combined_id"`). +#' @param samples Character vector of group labels/patterns to include. If +#' `NULL` or if none match, all groups in `group_by` are used. +#' @param batch Optional batch indicator; if length 1, an intercept-free design +#' is used with group dummies. +#' @param cutoffs Numeric vector of user-supply ZI thresholds for summary statistics +#' +#' @returns A list with: +#' * `gene_metrics_by_group`: long data frame (group × gene) with `p0_obs`, +#' `p0_nb`, `ZI`, and counts. +#' * `summary_by_group`: one row per group with medians and % ZI thresholds, +#' plus observed/expected zero **counts** for the group. +#' +#' @note +#' - This is a **screening** tool; it is not a replacement for fitting a full +#' GLM with your actual design. If strong covariates exist, a GLM baseline +#' (e.g., `glmGamPoi::glm_gp`) will yield more faithful expected-zero rates. +#' - For single-group experiments, consider either adding a reference group or +#' switching to a GLM-based baseline that does not require multiple groups. +#' @export +#' @examples +#' data(mini_mac) +#' check_zeroinflation(mini_mac, group_by = "combined_id", +#' samples = c("DMSO_0","Staurosporine_10")) + + +check_zeroinflation <- function(data = NULL, + group_by = NULL, + samples = NULL, + batch = 1, + cutoffs = c(0.1, 0.20) +){ + validate_inputs <- function(data, group_by, samples, cutoffs) { + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + group_by <- if (is.null(group_by)) "combined_id" else group_by + + # check samples in combined_id column + meta_groups <- as.character(data@meta.data[[group_by]]) + matched_groups <- samples %in% meta_groups + if (is.null(samples)){ + # all samples included + samples <- unique(data@meta.data[[group_by]]) + cat("All samples will be included in the combined_id column.") + } else if (length(samples) == 1 || length(which(matched_groups==TRUE)) < 2) { + # need at least two groups for edgeR dispersion estimation + stop("Two treatment groups are needed to calculate dispersion using edgeR.") + } + # check cutoffs + if (any(cutoffs <= 0) || any(cutoffs >= 1)) { + stop("cutoffs must be between 0 and 1.") + } + return(list(data = data, group_by = group_by, samples = samples, cutoffs = cutoffs)) + } + validated <- validate_inputs(data, group_by, samples,cutoffs) + data <- validated$data + group_by <- validated$group_by + samples <- validated$samples + cutoffs <- validated$cutoffs + mac_data <- subset(data, subset = combined_id %in% samples) + count_matrix <- GetAssayData(mac_data, assay = "RNA", layer = "counts") + count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE) + obs_zero <- Matrix::rowMeans(count_matrix == 0) + # Negative binomial expected zeros + # using edgeR for dispersion estimation + dge <- edgeR::DGEList(counts = count_matrix) + dge <- edgeR::calcNormFactors(dge, method = "TMMwsp") + #design matrix + combined_id <- mac_data$combined_id + #make up batch parameter + model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else + model.matrix(~0 + combined_id + batch) + # tagwise dispersion + dge <- edgeR::estimateDisp(dge, design = model_matrix) + phi <- dge$tagwise.dispersion # NB variance: mu + phi * mu^2 (phi >= 0) + # Build per-sample NB mean mu_gj using TMMwsp-scaled library sizes + # Effective library sizes + eff_lib <- dge$samples$lib.size * dge$samples$norm.factors + per_group_gene_metrics <- lapply(samples, function(g){ + idx <- which(combined_id == g) + n_wells <- length(idx) + # sub count matrix for group g + count_matrix_g <- count_matrix[, idx, drop=FALSE] + # Observed zeros within group g + p0_obs_g <- Matrix::rowMeans(count_matrix_g==0) + # count zeros per gene within group g + # sum later for summary + obs_zero_num_g <- Matrix::rowSums(count_matrix_g==0) + # Group-specific q_{g,g} using only wells in group g + eff_lib_g <- eff_lib[idx] + total_eff_lib_g <- sum(eff_lib_g) + total_counts_per_gene_g <- Matrix::rowSums(count_matrix_g) + q_g_g <- as.numeric(total_counts_per_gene_g) / total_eff_lib_g + # NB-expected zeros within group g (average over wells in g) + eps <- 1e-12 + phi_safe <- pmax(phi, eps) + inv_phi <- 1 / phi_safe + # Fast loop over wells in g, no GxJ materialization + p0_nb_sum_g <- numeric(nrow(count_matrix)) + for (j in seq_along(idx)) { + Lj <- eff_lib_g[j] + mu_gj <- q_g_g * Lj + p0_nb_sum_g <- p0_nb_sum_g + (1 + phi_safe * mu_gj)^(-inv_phi) + } + p0_nb_g <- p0_nb_sum_g / length(idx) + # Poisson fallback where phi ~ 0 + poi_idx <- which(phi < 1e-8) + if (length(poi_idx)) { + mu_bar_g <- q_g_g * mean(eff_lib_g) + p0_nb_g[poi_idx] <- exp(-mu_bar_g[poi_idx]) + } + # ZI within group g + zi_g <- p0_obs_g - p0_nb_g + data.frame( + group = g, + gene = rownames(count_matrix), + mean_count_group = total_counts_per_gene_g / length(idx), + dispersion = phi, + p0_obs = p0_obs_g, + obs_zeros_num = obs_zero_num_g, + p0_nb = p0_nb_g, + expected_zeros_num = p0_nb_g*n_wells, + ZI = zi_g, + stringsAsFactors = FALSE + ) + }) + gene_metrics_by_group <- do.call(rbind, per_group_gene_metrics) + #if there are more than one cutoffs, calculate pct_ZI_gt_ for each cutoff + # Per-group summaries (one row per group) + summary_by_group <- do.call(rbind, lapply(split(gene_metrics_by_group, gene_metrics_by_group$group), function(df){ + list_a <- list( + group = unique(df$group), + n_genes = nrow(df), + n_wells = sum(combined_id == unique(df$group)), + median_p0_obs = median(df$p0_obs), + median_p0_nb = median(df$p0_nb), + median_ZI = median(df$ZI), + observed_zeros_num = sum(df$obs_zeros_num), + expected_zeros_num = sum(df$expected_zeros_num) + ) + list_b <- lapply(cutoffs, function(cutoff){ + pct_name <- paste0("pct_ZI_gt_", cutoff) + pct_value <- mean(df$ZI > cutoff) + setNames(list(pct_value), pct_name) + }) + as.data.frame(c(list_a, list_b)) + })) + # Return just the selected groups' indices instead of plate-level + list( + gene_metrics_by_group = gene_metrics_by_group, # long format: group × gene + summary_by_group = summary_by_group + ) + +} \ No newline at end of file diff --git a/R/subsample_genes.R b/R/subsample_genes.R new file mode 100644 index 0000000..746f1f5 --- /dev/null +++ b/R/subsample_genes.R @@ -0,0 +1,45 @@ +#' Subsample genes (fast helper function for zero-inflation checks) +#' +#' @description +#' Quickly subsample a specified number of **genes** from a Seurat object and +#' return a smaller Seurat object containing the selected features and all +#' original wells/samples. This is a lightweight convenience wrapper around +#' **`seqgendiff::select_counts()`** and is intended for creating a small +#' working object to run **`check_zeroinflation()`** (or similar +#' diagnostics) rapidly. +#' +#' @param data A Seurat object (v4 or v5) with counts in assay "RNA". +#' @param ngene Integer. Number of genes to keep (must be <= total genes). +#' @param gselect Gene-selection strategy as used by +#' **`seqgendiff::select_counts()`**. Defaults to `"random"`. +#' @param seed Integer random seed for reproducibility. +#' @return A Seurat object containing the subsampled genes and all original wells/samples. +#' @export +#' @examples +#' data(mini_mac) +#' subsample_genes(mini_mac, ngene = 50, gselect = "random", seed = 1 ) +#' +subsample_genes <- function(data, + ngene = 100, + gselect = "random", + seed = 1){ + if (utils::packageVersion("Seurat") < "5.0.0" + ) { + count_matrix <- GetAssayData(data, assay = "RNA", slot = "counts") + } else { + count_matrix <- GetAssayData(data, assay = "RNA", layer = "counts") + } + count_matrix <- as.matrix(count_matrix) + if (!inherits(data, "Seurat")) { + stop("argument 'data' must be a Seurat or TidySeurat object.") + } + if (ngene > nrow(count_matrix)){ + stop("ngene should be less than the total number of genes in the dataset.") + } + set.seed(seed) + sub_matrix <- seqgendiff::select_counts(count_matrix, ngene = ngene, gselect = gselect) + subsample_genes <- rownames(sub_matrix) + data_sub <- data[subsample_genes, ] + return(data_sub) + +} diff --git a/_pkgdown.yaml b/_pkgdown.yaml index 6247e65..3fc5d00 100644 --- a/_pkgdown.yaml +++ b/_pkgdown.yaml @@ -31,6 +31,8 @@ navbar: href: articles/integration_across_modalities.html - text: "Working with Bioconductor classes" href: articles/macpie_bioc.html + - text: "Chck zero-inflation" + href: articles/check_zero_inflation.html - text: "Reference" href: reference/index.html # Links to your function reference docs. right: diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd new file mode 100644 index 0000000..f79125a --- /dev/null +++ b/man/check_zeroinflation.Rd @@ -0,0 +1,73 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/check_zeroinflation.R +\name{check_zeroinflation} +\alias{check_zeroinflation} +\title{Quick group-aware zero-inflation check (Negative Binomial baseline via edgeR)} +\usage{ +check_zeroinflation( + data = NULL, + group_by = NULL, + samples = NULL, + batch = 1, + cutoffs = c(0.1, 0.2) +) +} +\arguments{ +\item{data}{Seurat object.} + +\item{group_by}{Character, column in \code{data@meta.data} that defines groups +(default: \code{"combined_id"}).} + +\item{samples}{Character vector of group labels/patterns to include. If +\code{NULL} or if none match, all groups in \code{group_by} are used.} + +\item{batch}{Optional batch indicator; if length 1, an intercept-free design +is used with group dummies.} + +\item{cutoffs}{Numeric vector of user-supply ZI thresholds for summary statistics} +} +\value{ +A list with: +\itemize{ +\item \code{gene_metrics_by_group}: long data frame (group × gene) with \code{p0_obs}, +\code{p0_nb}, \code{ZI}, and counts. +\item \code{summary_by_group}: one row per group with medians and \% ZI thresholds, +plus observed/expected zero \strong{counts} for the group. +} +} +\description{ +Computes a \strong{sample group-aware Zero-Inflation (ZI) index} for each gene using a +negative-binomial (NB) baseline fitted with \strong{edgeR}. For each group +(e.g., drug condition), the function: +\enumerate{ +\item estimates gene-wise tagwise dispersions with edgeR (using all selected groups), +\item builds NB-expected zero probabilities from TMMwsp-scaled means, and +\item returns per-gene ZI (observed zeros minus NB-expected zeros) and +per-group summaries (e.g., \% genes with ZI > 0.05). ZI-cutoffs are user-defined. +} + +This is intended as a \strong{fast screening diagnostic} to decide whether +standard NB GLM methods (edgeR/DESeq2) are adequate or whether a +zero-aware workflow (e.g., ZINB-WaVE) might be warranted. + +This function \strong{relies on edgeR} to estimate dispersion. The current +implementation requires \strong{≥2 groups} in the design so that edgeR can +stabilize gene-wise dispersions across groups. If you only have a single +group and still want a design-aware baseline for expected zeros, fit a +Gamma–Poisson/NB GLM and compute the +expected zero probabilities from its fitted means and over-dispersion. +} +\note{ +\itemize{ +\item This is a \strong{screening} tool; it is not a replacement for fitting a full +GLM with your actual design. If strong covariates exist, a GLM baseline +(e.g., \code{glmGamPoi::glm_gp}) will yield more faithful expected-zero rates. +\item For single-group experiments, consider either adding a reference group or +switching to a GLM-based baseline that does not require multiple groups. +} +} +\examples{ +data(mini_mac) +check_zeroinflation(mini_mac, group_by = "combined_id", + samples = c("DMSO_0","Staurosporine_10")) +} diff --git a/man/subsample_genes.Rd b/man/subsample_genes.Rd new file mode 100644 index 0000000..5114db1 --- /dev/null +++ b/man/subsample_genes.Rd @@ -0,0 +1,34 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/subsample_genes.R +\name{subsample_genes} +\alias{subsample_genes} +\title{Subsample genes (fast helper function for zero-inflation checks)} +\usage{ +subsample_genes(data, ngene = 100, gselect = "random", seed = 1) +} +\arguments{ +\item{data}{A Seurat object (v4 or v5) with counts in assay "RNA".} + +\item{ngene}{Integer. Number of genes to keep (must be <= total genes).} + +\item{gselect}{Gene-selection strategy as used by +\strong{\code{seqgendiff::select_counts()}}. Defaults to \code{"random"}.} + +\item{seed}{Integer random seed for reproducibility.} +} +\value{ +A Seurat object containing the subsampled genes and all original wells/samples. +} +\description{ +Quickly subsample a specified number of \strong{genes} from a Seurat object and +return a smaller Seurat object containing the selected features and all +original wells/samples. This is a lightweight convenience wrapper around +\strong{\code{seqgendiff::select_counts()}} and is intended for creating a small +working object to run \strong{\code{check_zeroinflation()}} (or similar +diagnostics) rapidly. +} +\examples{ +data(mini_mac) +subsample_genes(mini_mac, ngene = 50, gselect = "random", seed = 1 ) + +} diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R new file mode 100644 index 0000000..08d6485 --- /dev/null +++ b/tests/testthat/test-check_zeroinflation.R @@ -0,0 +1,18 @@ +test_that("chech_zeroinflation works", { + # Load test dataset from tests/testthat/testdata + testdata <- get_testdata() + #only match one group should fail the minimum requirement of two groups + expect_error(check_zeroinflation(data = testdata, group_by = "combined_id", + samples = c("DrugA_10", "DMSO_0"), + cutoffs = c(0.1, 0.2))) + #cutoffs out of range + expect_error(check_zeroinflation(data = testdata, group_by = "combined_id", + samples = c("Luminespib_10", "DMSO_0"), + cutoffs = 1.2)) + # expect a list as output + res <- check_zeroinflation(data = testdata, group_by = "combined_id", + samples = c("Luminespib_10", "DMSO_0"), + cutoffs = c(0.1, 0.2)) + expect_type(res, "list") +}) + diff --git a/tests/testthat/test-subsample_genes.R b/tests/testthat/test-subsample_genes.R new file mode 100644 index 0000000..a2b5169 --- /dev/null +++ b/tests/testthat/test-subsample_genes.R @@ -0,0 +1,17 @@ +test_that("subsample_genes works", { + # Load test dataset from tests/testthat/testdata + testdata <- get_testdata() + expect_error(subsample_genes(testdata, + ngene = 1000, + gselect = "random", + seed = 1)) + expect_error(subsample_genes(testdata, + ngene = 100, + gselect = "min", + seed = 1)) + subsampled_genes <- subsample_genes(testdata, + ngene = 50, + gselect = "random", + seed = 1) + expect_s4_class(subsampled_genes,"Seurat") +}) diff --git a/vignettes/check_zero_inflation.Rmd b/vignettes/check_zero_inflation.Rmd new file mode 100644 index 0000000..b74f9e9 --- /dev/null +++ b/vignettes/check_zero_inflation.Rmd @@ -0,0 +1,243 @@ +--- +title: "Check for zero-inflation in your data" +output: + rmarkdown::html_document: + theme: flatly + toc: true + toc_float: true + toc_depth: 5 +vignette: > + %\VignetteIndexEntry{check_zero_inflation} + %\VignetteEngine{knitr::rmarkdown} + %\VignetteEncoding{UTF-8} + %\VignetteBuild{true} + +--- + + +```{r, include = FALSE} +knitr::opts_chunk$set( + collapse = TRUE, + comment = "#>", + digits = 3 +) +options(digits = 3) +``` + + + +## Overview + +This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model. + +We will use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. + +Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata. + +This function: + + - estimates gene-wise tagwise dispersions with edgeR (using all selected groups), + + - builds NB-expected zero probabilities from TMMwsp-scaled means, and + + - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. + + +The output is a list with two elements: + + - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs. + + - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group. + +**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion. + +*** +
+ +```{r set_wd, include=FALSE} +dir <- "/Users/liuxin/macpie_Dev/" +devtools::load_all(paste0(dir, "macpie/")) +``` + +## Load data + +Load data and preprocess before subsampling genes and checking for zero-inflation. + +```{r setup} +#install.packages("macpie") # or devtools::install_github("PMCC-BioinformaticsCore/macpie") +library(macpie) + +# Define project variables +project_name <- "PMMSq033" +project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie") +# Load metadata +metadata <- read_metadata(project_metadata) + +``` + +```{r load_data} +# Import raw data +project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix") +project_name <- "PMMSq033" +raw_counts <- Read10X(data.dir = project_rawdata) +# Create tidySeurat object +mac <- CreateSeuratObject(counts = raw_counts, + project = project_name, + min.cells = 1, + min.features = 1) +# Join with metadata +mac <- mac %>% + inner_join(metadata, by = c(".cell" = "Barcode")) +# Add unique identifier +mac <- mac %>% + mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>% + mutate(combined_id = gsub(" ", "", .data$combined_id)) +mac <- mac %>% + filter(Project == "Current") + +``` + +Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment. + +```{r} +# Filter by read count per sample group +mac <- filter_genes_by_expression(mac, + group_by = "combined_id", + min_counts = 10, + min_samples = 2) +``` + + +## Subsample genes for faster computation + +This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset. + +```{r} +# Subsample genes for faster computation +sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1) +sub_mac %>% nrow() +``` + + +## Check for zero-inflation + +We can now run the zero-inflation check on the subsampled data. + +### DMSO vs Camptothecin + +First, we will compare two groups: "DMSO_0" and "CAMPTOTHECIN_10". + + + +```{r} +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = c("DMSO_0","CAMPTOTHECIN_10"), + batch = 1, + cutoffs = c(0.1, 0.2)) + +``` + +#### View gene-wise metrics for each group + +For DMSO group: + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() +``` + +For Camptothecin group: + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head() +``` + +#### View summary statistics for each group + +```{r} +zi_results$summary_by_group +``` + + +From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs. + +Each of the columns in the summary table are defined as follows: + + - group: The treatment group + + - n_genes: Total number of genes analyzed in the group + + - n_wells: Total number of wells/samples in the group + + - median_p0_obs: Median observed zero proportion across genes in the group + + - median_p0_nb: Median expected zero proportion under the NB model across genes in the group + + - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group + + - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group) + + - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group) + + - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1 + + - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2 + +*** +
+ +#### See below for interpretation of the results + +For DMSO vs camptothecin, we can see that both DMSO and camptothecin groups have negative median ZI values, indicating that there is no significant zero-inflation in either group. Percentage of genes with ZI greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As camptothecin acts as a DNA topoisomerase inhibitor and induces DNA damage, it can lead to cell cycle arrest and apoptosis, which may reduce the overall transcriptional activity in the cells. This reduction in transcriptional activity can result in genes being expressed at lower levels or not at all. On the other hand, DMSO is a common solvent used in biological experiments and is not expected to induce significant changes in gene expression. Therefore, it is reasonable to observe a lower level of zero-inflation in the DMSO group compared to the camptothecin group. Same as ZI results for DMSO, the percentage of genes with ZI greater than 0.1 or 0.2 are both 0%. + +Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this dataset. + + + +```{r, include=FALSE} +rm(zi_results) +``` + +*** +
+ + + +### DMSO vs Staruosporine + +Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO. + + +```{r} +# Check for zero-inflation +zi_results <- check_zeroinflation(sub_mac, + group_by = "combined_id", + samples = c("DMSO_0","Staurosporine_10"), + batch = 1, + cutoffs = c(0.1, 0.2)) +``` + + +#### View gene-wise metrics for each group + +For DMSO group: + +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head() +``` + +For Staurosporine group: +```{r} +zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head() +``` + +#### View summary statistics for each group + +```{r} +zi_results$summary_by_group +``` +#### See below for interpretation of the results + +For DMSO vs staurosporine, we can see that only staurosporine has 14% of median ZI value, the observed zeros number (713) is much higher than the expected zeros number (~355), indicating that there is some level of zero-inflation in the staurosporine group. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. In this case, it may be appropriate to consider using a zero-inflated negative binomial (ZINB) model for downstream analyses to account for the excess zeros in the data. + +