From 3a0c0734cda156dc94b7d55cf1e8046d1f5fdd63 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 1 Oct 2025 16:52:13 +1000
Subject: [PATCH 01/11] add check_zeroinflation.R

---
 R/check_zeroinflation.R | 135 ++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)
 create mode 100644 R/check_zeroinflation.R

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
new file mode 100644
index 0000000..a5c3cf1
--- /dev/null
+++ b/R/check_zeroinflation.R
@@ -0,0 +1,135 @@
+#' @tite Check zero-inflated counts 
+#'
+#' This offers a fast, method-agonistic way to check whether the data is zero-inflated 
+#' relative to a Negative Binomial distribution. 
+#' 
+#' It computes observed zero fraction and NB-expected zero fraction per gene,
+#' then reports a ZI index = p0_obs - p0_NB. Uses edgeR for dispersion.
+#' 
+#' Other methods to check zero-inflation include: GLM-fitted (glmGamPoi) when
+#' your data has strong effects (plate/row/column/treatment) and you want 
+#' the most accurate estimation.
+#' 
+
+
+check_zeroinflation <- function(data = NULL,
+                                treatment_samples = NULL,
+                                control_samples = NULL,
+                                batch = 1
+                                ){
+  
+  validate_inputs <- function(data, treatment_samples, control_samples) {
+    if (!inherits(data, "Seurat")) {
+      stop("argument 'data' must be a Seurat or TidySeurat object.")
+    }
+    if (is.null(treatment_samples) || is.null(control_samples)) {
+      stop("Missing the vectors of treatment and control samples.")
+    }
+    if (!"combined_id" %in% colnames(data@meta.data)) {
+      data <- data %>%
+        mutate(combined_id = apply(select(starts_with("Treatment_") | starts_with("Concentration_")),
+                                   1, paste, collapse = "_")) %>%
+        mutate(combined_id = gsub(" ", "", .data$combined_id))
+    }
+    if (length(treatment_samples) == 1 && length(control_samples) == 1) {
+      treatment_samples_list <- grepl(treatment_samples, data$combined_id)
+      control_samples_list <- grepl(control_samples, data$combined_id)
+      if (any(sum(treatment_samples_list) == 0, sum(control_samples_list) == 0)) {
+        stop("Your treatment and control samples are not in your combined_id column.")
+      }
+    }
+  }
+  
+  
+  count_matrix <- GetAssayData(data, assay = "RNA", layer = "counts")
+  count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE)
+  obs_zero <- Matrix::rowMeans(count_matrix == 0)
+  
+  # Negative binomial expected zeros
+  # using edgeR for dispersion estimation
+  dge <- edgeR::DGEList(counts = count_matrix)
+  dge <- edgeR::calcNormFactors(dge, method = "TMMwsp")
+  
+  #design matrix
+  combined_id <- data$combined_id
+  #model matrix with the batch parameter
+  model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
+    model.matrix(~0 + combined_id + batch)
+  # tagwise dispersion
+  dge <- edgeR::estimateDisp(dge, design = model_matrix)  
+  phi <- dge$tagwise.dispersion  # NB variance: mu + phi * mu^2  (phi >= 0)
+  
+  
+  # ---- 4) Build per-sample NB mean mu_gj using TMM-scaled library sizes
+  # Effective library sizes
+  eff_lib <- dge$samples$lib.size * dge$samples$norm.factors
+  # Per-gene proportion q_g = total counts for gene / total eff_lib across samples
+  total_eff_lib <- sum(eff_lib)
+  total_counts_per_gene <- Matrix::rowSums(count_matrix)
+  q_g <- as.numeric(total_counts_per_gene) / total_eff_lib
+  # expected mean cont for each gene in each sample
+  mu <- q_g %o% eff_lib      # (genes x samples) without materializing too big objects in typical 384-well
+  
+  
+  # ---- 5) NB-expected zero per gene, averaged across samples
+  # p0_NB_gj = (1 + phi_g * mu_gj)^(-1/phi_g); then average over j
+  # guard against phi==0 (Poisson limit): use exp(-mu) when phi ~ 0
+  eps <- 1e-12
+  phi_safe <- pmax(phi, eps)
+  # vectorized: compute per gene using rowMeans on transformed mu
+  inv_phi <- 1 / phi_safe
+  # (1 + phi * mu)^(-1/phi)  -> do per gene with broadcasting
+  # Use sweep for efficiency
+  one_plus <- sweep(mu, 1, phi_safe, `*`)
+  one_plus <- 1 + one_plus
+  p0_nb_mat <- sweep(one_plus, 1, inv_phi, `^`)  # (1 + phi*mu)^(-1/phi)
+  p0_nb <- rowMeans(1 / p0_nb_mat)               # since above computed (1+phi*mu)^(+1/phi); invert
+  # NOTE: If memory is a concern, chunk over columns; okay for 384 wells.
+  
+  # Poisson fallback where phi was ~0 (replace those rows with exp(-rowMeans(mu)))
+  poi_idx <- which(phi < 1e-8)
+  if (length(poi_idx)) {
+    mu_bar <- rowMeans(mu[poi_idx, , drop = FALSE])
+    p0_nb[poi_idx] <- exp(-mu_bar)
+  }
+  
+  # ---- 6) ZI index per gene
+  zi <- obs_zero - p0_nb
+  
+  # ---- 7) Summaries
+  libsize <- Matrix::colSums(count_matrix)
+  summary <- list(
+    n_genes          = nrow(count_matrix),
+    n_samples        = ncol(count_matrix),
+    median_libsize   = stats::median(libsize),
+    median_p0_obs    = stats::median(obs_zero),
+    median_p0_nb     = stats::median(p0_nb),
+    median_ZI        = stats::median(zi),
+    pct_ZI_gt_0.05   = mean(zi > 0.05),
+    pct_ZI_gt_0.10   = mean(zi > 0.10),
+    mean_observed_zeros       = mean(obs_zero*ncol(count_matrix)),
+    mean_expected_zeros       = mean(p0_nb*ncol(count_matrix))
+  )
+  
+  thresholds_used <- list(
+    zi_margin_small = 0.05,
+    zi_margin_mod   = 0.10
+  )
+  
+  gene_metrics <- data.frame(
+    gene = rownames(count_matrix),
+    mean_count = total_counts_per_gene / ncol(count_matrix),
+    dispersion = phi,
+    p0_obs = p0_obs,
+    p0_nb  = p0_nb,
+    ZI     = zi,
+    stringsAsFactors = FALSE
+  )
+  
+  list(
+    gene_metrics   = gene_metrics,
+    summary        = summary,
+    thresholds_used = thresholds_used
+  )
+  
+}
\ No newline at end of file

From 1f7ce5490f46fa5a8defaf378115478715d7fd2d Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Mon, 6 Oct 2025 21:59:56 +1100
Subject: [PATCH 02/11] add grouping option

---
 R/check_zeroinflation.R | 43 +++++++++++++++++++++++++++--------------
 1 file changed, 28 insertions(+), 15 deletions(-)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
index a5c3cf1..2938ecc 100644
--- a/R/check_zeroinflation.R
+++ b/R/check_zeroinflation.R
@@ -13,30 +13,43 @@
 
 
 check_zeroinflation <- function(data = NULL,
-                                treatment_samples = NULL,
-                                control_samples = NULL,
+                                group_by = NULL,
+                                samples = NULL,
                                 batch = 1
                                 ){
   
-  validate_inputs <- function(data, treatment_samples, control_samples) {
+  validate_inputs <- function(data, samples) {
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
-    if (is.null(treatment_samples) || is.null(control_samples)) {
-      stop("Missing the vectors of treatment and control samples.")
+    group_by <- if (is.null(group_by)) "combined_id" else group_by} 
+    
+    #decide grouping mode
+    meta_groups <- as.character(data@meta.data[[group_by]])
+    matched_groups <- !is.null(samples) && any(grepl(samples, meta_groups))
+    if (matched_groups){
+      
     }
-    if (!"combined_id" %in% colnames(data@meta.data)) {
-      data <- data %>%
-        mutate(combined_id = apply(select(starts_with("Treatment_") | starts_with("Concentration_")),
-                                   1, paste, collapse = "_")) %>%
-        mutate(combined_id = gsub(" ", "", .data$combined_id))
+    
+    
+    if (is.null(samples)) {
+      stop("Samples are not specified, whole set of samples will be included.")
+    } else {
+      if (length(samples)>=1){
+        sample_list <- grepl(samples, data$combined_id)
+        if (sum(sample_list) == 0) {
+          stop("Your samples are not in your combined_id column.")
+        }
+      }
     }
-    if (length(treatment_samples) == 1 && length(control_samples) == 1) {
-      treatment_samples_list <- grepl(treatment_samples, data$combined_id)
-      control_samples_list <- grepl(control_samples, data$combined_id)
-      if (any(sum(treatment_samples_list) == 0, sum(control_samples_list) == 0)) {
-        stop("Your treatment and control samples are not in your combined_id column.")
+    # Helper: Prepare data and pheno_data
+    prepare_data <- function(data, samples, batch) {
+      data <- data[, grepl(samples, data$combined_id)]
+      if (length(unique(data$combined_id)) < 2) {
+        stop("Insufficient factors for differential expression analysis.")
       }
+      pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
+      return(list(data = data, pheno_data = pheno_data))
     }
   }
   

From d2b593388b7a187bfc551a58c2053f6d1dd87edb Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 10:40:40 +1100
Subject: [PATCH 03/11] simplify group/plate-level

---
 R/check_zeroinflation.R | 269 +++++++++++++++++++++++-----------------
 1 file changed, 155 insertions(+), 114 deletions(-)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
index 2938ecc..f89ef94 100644
--- a/R/check_zeroinflation.R
+++ b/R/check_zeroinflation.R
@@ -1,60 +1,89 @@
-#' @tite Check zero-inflated counts 
+#' Quick group-aware zero-inflation check (NB baseline via edgeR)
 #'
-#' This offers a fast, method-agonistic way to check whether the data is zero-inflated 
-#' relative to a Negative Binomial distribution. 
-#' 
-#' It computes observed zero fraction and NB-expected zero fraction per gene,
-#' then reports a ZI index = p0_obs - p0_NB. Uses edgeR for dispersion.
-#' 
-#' Other methods to check zero-inflation include: GLM-fitted (glmGamPoi) when
-#' your data has strong effects (plate/row/column/treatment) and you want 
-#' the most accurate estimation.
-#' 
+#' @description
+#' Computes a **group-aware Zero-Inflation (ZI) index** for each gene using a
+#' negative-binomial (NB) baseline fitted with **edgeR**. For each group
+#' (e.g., drug condition), the function:
+#' 1) estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
+#' 2) builds NB-expected zero probabilities from TMMwsp-scaled means, and
+#' 3) returns per-gene ZI (observed zeros minus NB-expected zeros) and
+#'    per-group summaries (e.g., % genes with ZI > 0.05).
+#'
+#' This is intended as a **fast screening diagnostic** to decide whether
+#' standard NB GLM methods (edgeR/DESeq2) are adequate or whether a
+#' zero-aware workflow (e.g., ZINB-WaVE) might be warranted.
+#'
+#' @important
+#' This helper **relies on edgeR** to estimate dispersion. The current
+#' implementation requires **≥2 groups** in the design so that edgeR can
+#' stabilize gene-wise dispersions across groups. If you only have a single
+#' group and still want a design-aware baseline for expected zeros, fit a
+#' Gamma–Poisson/NB GLM yourself (e.g., with **glmGamPoi**) and compute the
+#' expected zero probabilities from its fitted means and overdispersion.
+#'
+#' @param data Seurat object.
+#' @param group_by Character, column in `data@meta.data` that defines groups
+#'   (default: `"combined_id"`).
+#' @param samples Character vector of group labels/patterns to include. If
+#'   `NULL` or if none match, all groups in `group_by` are used.
+#' @param batch Optional batch indicator; if length 1, an intercept-free design
+#'   is used with group dummies.
+#'
+#' @returns A list with:
+#' * `gene_metrics_by_group`: long data frame (group × gene) with `p0_obs`,
+#'   `p0_nb`, `ZI`, and counts.
+#' * `summary_by_group`: one row per group with medians and % ZI thresholds,
+#'   plus observed/expected zero **counts** for the group.
+#'
+#' @note
+#' - This is a **screening** tool; it is not a replacement for fitting a full
+#'   GLM with your actual design. If strong covariates exist, a GLM baseline
+#'   (e.g., `glmGamPoi::glm_gp`) will yield more faithful expected-zero rates.
+#' - For single-group experiments, consider either adding a reference group or
+#'   switching to a GLM-based baseline that does not require multiple groups.
+#'
+#' @examples
+#' # check_zeroinflation(mini_mac, group_by = "combined_id",
+#' #                           samples = c("DrugA_10", "DMSO_0"))
+
 
 
 check_zeroinflation <- function(data = NULL,
-                                group_by = NULL,
-                                samples = NULL,
-                                batch = 1
-                                ){
-  
-  validate_inputs <- function(data, samples) {
+                                     group_by = NULL,
+                                     samples = NULL,
+                                     batch = 1
+){
+  validate_inputs <- function(data, group_by, samples) {
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
-    group_by <- if (is.null(group_by)) "combined_id" else group_by} 
+    group_by <- if (is.null(group_by)) "combined_id" else group_by
     
-    #decide grouping mode
+    # check samples in combined_id column
     meta_groups <- as.character(data@meta.data[[group_by]])
     matched_groups <- !is.null(samples) && any(grepl(samples, meta_groups))
-    if (matched_groups){
-      
-    }
-    
-    
-    if (is.null(samples)) {
-      stop("Samples are not specified, whole set of samples will be included.")
-    } else {
-      if (length(samples)>=1){
-        sample_list <- grepl(samples, data$combined_id)
-        if (sum(sample_list) == 0) {
-          stop("Your samples are not in your combined_id column.")
-        }
-      }
-    }
-    # Helper: Prepare data and pheno_data
-    prepare_data <- function(data, samples, batch) {
-      data <- data[, grepl(samples, data$combined_id)]
-      if (length(unique(data$combined_id)) < 2) {
-        stop("Insufficient factors for differential expression analysis.")
-      }
-      pheno_data <- data.frame(batch = as.factor(batch), condition = as.factor(data$combined_id))
-      return(list(data = data, pheno_data = pheno_data))
-    }
+    if (!matched_groups){
+      # all samples included
+      samples <- unique(data@meta.data[[group_by]])
+      cat("All samples will be included in the combined_id column.")
+    } 
+    # need at least two groups for edgeR dispersion estimation
+    if (length(samples) == 1) {
+      stop("Two treatment groups are needed to calculate dispersion using edgeR.")
+    } 
+    return(list(data = data, group_by = group_by, samples = samples))
   }
   
   
-  count_matrix <- GetAssayData(data, assay = "RNA", layer = "counts")
+  
+  validated <- validate_inputs(data, group_by, samples)
+  data <- validated$data
+  group_by <- validated$group_by
+  samples <- validated$samples
+  
+  # samples must be specified
+  mac_data <- subset(data, subset = combined_id %in% samples)
+  count_matrix <- GetAssayData(mac_data, assay = "RNA", layer = "counts")
   count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE)
   obs_zero <- Matrix::rowMeans(count_matrix == 0)
   
@@ -64,85 +93,97 @@ check_zeroinflation <- function(data = NULL,
   dge <- edgeR::calcNormFactors(dge, method = "TMMwsp")
   
   #design matrix
-  combined_id <- data$combined_id
-  #model matrix with the batch parameter
+  combined_id <- mac_data$combined_id
+  #make up batch parameter
   model_matrix <- if (length(batch) == 1) model.matrix(~0 + combined_id) else
     model.matrix(~0 + combined_id + batch)
   # tagwise dispersion
   dge <- edgeR::estimateDisp(dge, design = model_matrix)  
   phi <- dge$tagwise.dispersion  # NB variance: mu + phi * mu^2  (phi >= 0)
-  
-  
-  # ---- 4) Build per-sample NB mean mu_gj using TMM-scaled library sizes
+  # Build per-sample NB mean mu_gj using TMMwsp-scaled library sizes
   # Effective library sizes
   eff_lib <- dge$samples$lib.size * dge$samples$norm.factors
-  # Per-gene proportion q_g = total counts for gene / total eff_lib across samples
-  total_eff_lib <- sum(eff_lib)
-  total_counts_per_gene <- Matrix::rowSums(count_matrix)
-  q_g <- as.numeric(total_counts_per_gene) / total_eff_lib
-  # expected mean cont for each gene in each sample
-  mu <- q_g %o% eff_lib      # (genes x samples) without materializing too big objects in typical 384-well
-  
-  
-  # ---- 5) NB-expected zero per gene, averaged across samples
-  # p0_NB_gj = (1 + phi_g * mu_gj)^(-1/phi_g); then average over j
-  # guard against phi==0 (Poisson limit): use exp(-mu) when phi ~ 0
-  eps <- 1e-12
-  phi_safe <- pmax(phi, eps)
-  # vectorized: compute per gene using rowMeans on transformed mu
-  inv_phi <- 1 / phi_safe
-  # (1 + phi * mu)^(-1/phi)  -> do per gene with broadcasting
-  # Use sweep for efficiency
-  one_plus <- sweep(mu, 1, phi_safe, `*`)
-  one_plus <- 1 + one_plus
-  p0_nb_mat <- sweep(one_plus, 1, inv_phi, `^`)  # (1 + phi*mu)^(-1/phi)
-  p0_nb <- rowMeans(1 / p0_nb_mat)               # since above computed (1+phi*mu)^(+1/phi); invert
-  # NOTE: If memory is a concern, chunk over columns; okay for 384 wells.
-  
-  # Poisson fallback where phi was ~0 (replace those rows with exp(-rowMeans(mu)))
-  poi_idx <- which(phi < 1e-8)
-  if (length(poi_idx)) {
-    mu_bar <- rowMeans(mu[poi_idx, , drop = FALSE])
-    p0_nb[poi_idx] <- exp(-mu_bar)
-  }
-  
-  # ---- 6) ZI index per gene
-  zi <- obs_zero - p0_nb
-  
-  # ---- 7) Summaries
-  libsize <- Matrix::colSums(count_matrix)
-  summary <- list(
-    n_genes          = nrow(count_matrix),
-    n_samples        = ncol(count_matrix),
-    median_libsize   = stats::median(libsize),
-    median_p0_obs    = stats::median(obs_zero),
-    median_p0_nb     = stats::median(p0_nb),
-    median_ZI        = stats::median(zi),
-    pct_ZI_gt_0.05   = mean(zi > 0.05),
-    pct_ZI_gt_0.10   = mean(zi > 0.10),
-    mean_observed_zeros       = mean(obs_zero*ncol(count_matrix)),
-    mean_expected_zeros       = mean(p0_nb*ncol(count_matrix))
-  )
-  
-  thresholds_used <- list(
-    zi_margin_small = 0.05,
-    zi_margin_mod   = 0.10
-  )
-  
-  gene_metrics <- data.frame(
-    gene = rownames(count_matrix),
-    mean_count = total_counts_per_gene / ncol(count_matrix),
-    dispersion = phi,
-    p0_obs = p0_obs,
-    p0_nb  = p0_nb,
-    ZI     = zi,
-    stringsAsFactors = FALSE
-  )
   
+  per_group_gene_metrics <- lapply(samples, function(g){
+    
+    idx <- which(combined_id == g)
+    n_wells <- length(idx)
+    # sub count matrix for group g
+    count_matrix_g <- count_matrix[, idx, drop=FALSE]
+    # Observed zeros within group g
+    p0_obs_g <- Matrix::rowMeans(count_matrix_g==0)
+    
+    # count zeros per gene within group g
+    # sum later for summary
+    obs_zero_num_g <- Matrix::rowSums(count_matrix_g==0)
+    
+    # Group-specific q_{g,g} using only wells in group g
+    eff_lib_g <- eff_lib[idx]
+    total_eff_lib_g <- sum(eff_lib_g)
+    total_counts_per_gene_g <- Matrix::rowSums(count_matrix_g)
+    q_g_g <- as.numeric(total_counts_per_gene_g) / total_eff_lib_g
+    
+    # NB-expected zeros within group g (average over wells in g)
+    eps <- 1e-12
+    phi_safe <- pmax(phi, eps)
+    inv_phi <- 1 / phi_safe
+    
+    # Fast loop over wells in g, no GxJ materialization
+    p0_nb_sum_g <- numeric(nrow(count_matrix))
+    for (j in seq_along(idx)) {
+      Lj <- eff_lib_g[j]
+      mu_gj <- q_g_g * Lj
+      p0_nb_sum_g <- p0_nb_sum_g + (1 + phi_safe * mu_gj)^(-inv_phi)
+    }
+    p0_nb_g <- p0_nb_sum_g / length(idx)
+    
+    # Poisson fallback where phi ~ 0
+    poi_idx <- which(phi < 1e-8)
+    if (length(poi_idx)) {
+      mu_bar_g <- q_g_g * mean(eff_lib_g)
+      p0_nb_g[poi_idx] <- exp(-mu_bar_g[poi_idx])
+    }
+    
+    # ZI within group g
+    zi_g <- p0_obs_g - p0_nb_g
+    
+    data.frame(
+      group = g,
+      gene  = rownames(count_matrix),
+      mean_count_group = total_counts_per_gene_g / length(idx),
+      dispersion = phi,
+      p0_obs = p0_obs_g,
+      obs_zeros_num = obs_zero_num_g,
+      p0_nb  = p0_nb_g,
+      expected_zeros_num = p0_nb_g*n_wells,
+      ZI     = zi_g,
+      stringsAsFactors = FALSE
+    )
+  })
+  
+  gene_metrics_by_group <- do.call(rbind, per_group_gene_metrics)
+  
+  # Per-group summaries (one row per group)
+  summary_by_group <- do.call(rbind, lapply(split(gene_metrics_by_group, gene_metrics_by_group$group), function(df){
+    list(
+      group            = unique(df$group),
+      n_genes          = nrow(df),
+      n_wells          = sum(combined_id == unique(df$group)),
+      median_p0_obs    = median(df$p0_obs),
+      median_p0_nb     = median(df$p0_nb),
+      median_ZI        = median(df$ZI),
+      pct_ZI_gt_0.05   = mean(df$ZI > 0.05),
+      pct_ZI_gt_0.10   = mean(df$ZI > 0.1),
+      observed_zeros_num       = sum(df$obs_zeros_num),
+      expected_zeros_num  = sum(df$expected_zeros_num) 
+    )
+  })) |>
+    as.data.frame()
+  
+  # Return gene-level metrics and sample group-level summaries
   list(
-    gene_metrics   = gene_metrics,
-    summary        = summary,
-    thresholds_used = thresholds_used
+    gene_metrics_by_group = gene_metrics_by_group %>% head(10),     
+    summary_by_group      = summary_by_group
   )
   
 }
\ No newline at end of file

From 749ce909cdc3d8c54e2df2806214509870b30bac Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 10:55:08 +1100
Subject: [PATCH 04/11] add user-defined ZI cutoffs

---
 R/check_zeroinflation.R | 46 ++++++++++++++++++++++-------------------
 1 file changed, 25 insertions(+), 21 deletions(-)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
index f89ef94..a0b97c0 100644
--- a/R/check_zeroinflation.R
+++ b/R/check_zeroinflation.R
@@ -7,7 +7,7 @@
 #' 1) estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
 #' 2) builds NB-expected zero probabilities from TMMwsp-scaled means, and
 #' 3) returns per-gene ZI (observed zeros minus NB-expected zeros) and
-#'    per-group summaries (e.g., % genes with ZI > 0.05).
+#'    per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined. 
 #'
 #' This is intended as a **fast screening diagnostic** to decide whether
 #' standard NB GLM methods (edgeR/DESeq2) are adequate or whether a
@@ -28,6 +28,7 @@
 #'   `NULL` or if none match, all groups in `group_by` are used.
 #' @param batch Optional batch indicator; if length 1, an intercept-free design
 #'   is used with group dummies.
+#' @param cutoffs Numeric vector of user-supply ZI thresholds for summary statistics
 #'
 #' @returns A list with:
 #' * `gene_metrics_by_group`: long data frame (group × gene) with `p0_obs`,
@@ -51,9 +52,10 @@
 check_zeroinflation <- function(data = NULL,
                                      group_by = NULL,
                                      samples = NULL,
-                                     batch = 1
+                                     batch = 1,
+                                     cutoffs = c(0.1, 0.20)
 ){
-  validate_inputs <- function(data, group_by, samples) {
+  validate_inputs <- function(data, group_by, samples, cutoffs) {
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
@@ -71,17 +73,13 @@ check_zeroinflation <- function(data = NULL,
     if (length(samples) == 1) {
       stop("Two treatment groups are needed to calculate dispersion using edgeR.")
     } 
-    return(list(data = data, group_by = group_by, samples = samples))
+    # check cutoffs
+    if (any(cutoffs <= 0) || any(cutoffs >= 1)) {
+      stop("cutoffs must be between 0 and 1.")
+    }
+    return(list(data = data, group_by = group_by, samples = samples, cutoffs = cutoffs))
   }
   
-  
-  
-  validated <- validate_inputs(data, group_by, samples)
-  data <- validated$data
-  group_by <- validated$group_by
-  samples <- validated$samples
-  
-  # samples must be specified
   mac_data <- subset(data, subset = combined_id %in% samples)
   count_matrix <- GetAssayData(mac_data, assay = "RNA", layer = "counts")
   count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE)
@@ -144,7 +142,7 @@ check_zeroinflation <- function(data = NULL,
       p0_nb_g[poi_idx] <- exp(-mu_bar_g[poi_idx])
     }
     
-    # ZI within group g
+    # 4) ZI within group g
     zi_g <- p0_obs_g - p0_nb_g
     
     data.frame(
@@ -163,26 +161,32 @@ check_zeroinflation <- function(data = NULL,
   
   gene_metrics_by_group <- do.call(rbind, per_group_gene_metrics)
   
+  #if there are more than one cutoffs, calculate pct_ZI_gt_ for each cutoff
   # Per-group summaries (one row per group)
   summary_by_group <- do.call(rbind, lapply(split(gene_metrics_by_group, gene_metrics_by_group$group), function(df){
-    list(
+    
+    list_a <- list(
       group            = unique(df$group),
       n_genes          = nrow(df),
       n_wells          = sum(combined_id == unique(df$group)),
       median_p0_obs    = median(df$p0_obs),
       median_p0_nb     = median(df$p0_nb),
       median_ZI        = median(df$ZI),
-      pct_ZI_gt_0.05   = mean(df$ZI > 0.05),
-      pct_ZI_gt_0.10   = mean(df$ZI > 0.1),
       observed_zeros_num       = sum(df$obs_zeros_num),
       expected_zeros_num  = sum(df$expected_zeros_num) 
     )
-  })) |>
-    as.data.frame()
-  
-  # Return gene-level metrics and sample group-level summaries
+    
+    list_b <- lapply(cutoffs, function(cutoff){
+      pct_name <- paste0("pct_ZI_gt_", cutoff)
+      pct_value <- mean(df$ZI > cutoff)
+      setNames(list(pct_value), pct_name)
+    })
+    
+    as.data.frame(c(list_a, list_b))
+  })) 
+  # Return just the selected groups' indices instead of plate-level
   list(
-    gene_metrics_by_group = gene_metrics_by_group %>% head(10),     
+    gene_metrics_by_group = gene_metrics_by_group %>% head(10),     # long format: group × gene
     summary_by_group      = summary_by_group
   )
   

From 591e4f925d6c80995e514bbb8b8f2d7926723fd7 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 15:57:21 +1100
Subject: [PATCH 05/11] updated check_zeroinflation and added test

---
 man/check_zeroinflation.Rd                | 73 +++++++++++++++++++++++
 tests/testthat/test-check_zeroinflation.R | 18 ++++++
 2 files changed, 91 insertions(+)
 create mode 100644 man/check_zeroinflation.Rd
 create mode 100644 tests/testthat/test-check_zeroinflation.R

diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd
new file mode 100644
index 0000000..d7dd7ec
--- /dev/null
+++ b/man/check_zeroinflation.Rd
@@ -0,0 +1,73 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/check_zeroinflation.R
+\name{check_zeroinflation}
+\alias{check_zeroinflation}
+\title{Quick group-aware zero-inflation check (Negative Binomial baseline via edgeR)}
+\usage{
+check_zeroinflation(
+  data = NULL,
+  group_by = NULL,
+  samples = NULL,
+  batch = 1,
+  cutoffs = c(0.1, 0.2)
+)
+}
+\arguments{
+\item{data}{Seurat object.}
+
+\item{group_by}{Character, column in \code{data@meta.data} that defines groups
+(default: \code{"combined_id"}).}
+
+\item{samples}{Character vector of group labels/patterns to include. If
+\code{NULL} or if none match, all groups in \code{group_by} are used.}
+
+\item{batch}{Optional batch indicator; if length 1, an intercept-free design
+is used with group dummies.}
+
+\item{cutoffs}{Numeric vector of user-supply ZI thresholds for summary statistics}
+}
+\value{
+A list with:
+\itemize{
+\item \code{gene_metrics_by_group}: long data frame (group × gene) with \code{p0_obs},
+\code{p0_nb}, \code{ZI}, and counts.
+\item \code{summary_by_group}: one row per group with medians and \% ZI thresholds,
+plus observed/expected zero \strong{counts} for the group.
+}
+}
+\description{
+Computes a \strong{sample group-aware Zero-Inflation (ZI) index} for each gene using a
+negative-binomial (NB) baseline fitted with \strong{edgeR}. For each group
+(e.g., drug condition), the function:
+\enumerate{
+\item estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
+\item builds NB-expected zero probabilities from TMMwsp-scaled means, and
+\item returns per-gene ZI (observed zeros minus NB-expected zeros) and
+per-group summaries (e.g., \% genes with ZI > 0.05). ZI-cutoffs are user-defined.
+}
+
+This is intended as a \strong{fast screening diagnostic} to decide whether
+standard NB GLM methods (edgeR/DESeq2) are adequate or whether a
+zero-aware workflow (e.g., ZINB-WaVE) might be warranted.
+
+This function \strong{relies on edgeR} to estimate dispersion. The current
+implementation requires \strong{≥2 groups} in the design so that edgeR can
+stabilize gene-wise dispersions across groups. If you only have a single
+group and still want a design-aware baseline for expected zeros, fit a
+Gamma–Poisson/NB GLM (e.g., with \strong{glmGamPoi}) and compute the
+expected zero probabilities from its fitted means and over-dispersion.
+}
+\note{
+\itemize{
+\item This is a \strong{screening} tool; it is not a replacement for fitting a full
+GLM with your actual design. If strong covariates exist, a GLM baseline
+(e.g., \code{glmGamPoi::glm_gp}) will yield more faithful expected-zero rates.
+\item For single-group experiments, consider either adding a reference group or
+switching to a GLM-based baseline that does not require multiple groups.
+}
+}
+\examples{
+data(mini_mac)
+check_zeroinflation(mini_mac, group_by = "combined_id",
+                     samples = c("DMSO_0","Staurosporine_10"))
+}
diff --git a/tests/testthat/test-check_zeroinflation.R b/tests/testthat/test-check_zeroinflation.R
new file mode 100644
index 0000000..08d6485
--- /dev/null
+++ b/tests/testthat/test-check_zeroinflation.R
@@ -0,0 +1,18 @@
+test_that("chech_zeroinflation works", {
+  # Load test dataset from tests/testthat/testdata
+  testdata <- get_testdata()
+  #only match one group should fail the minimum requirement of two groups
+  expect_error(check_zeroinflation(data = testdata, group_by = "combined_id",
+                                    samples = c("DrugA_10", "DMSO_0"),
+                                    cutoffs = c(0.1, 0.2)))
+  #cutoffs out of range
+  expect_error(check_zeroinflation(data = testdata, group_by = "combined_id",
+                                   samples = c("Luminespib_10", "DMSO_0"),
+                                    cutoffs = 1.2))
+  # expect a list as output
+  res <- check_zeroinflation(data = testdata, group_by = "combined_id",
+                             samples = c("Luminespib_10", "DMSO_0"),
+                             cutoffs = c(0.1, 0.2))
+  expect_type(res, "list")
+}) 
+

From 91dfa8025a7422318600312009071022984251dd Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 15:58:13 +1100
Subject: [PATCH 06/11] added subsample_genes.R and tests

---
 R/subsample_genes.R                   | 45 +++++++++++++++++++++++++++
 man/subsample_genes.Rd                | 34 ++++++++++++++++++++
 tests/testthat/test-subsample_genes.R | 17 ++++++++++
 3 files changed, 96 insertions(+)
 create mode 100644 R/subsample_genes.R
 create mode 100644 man/subsample_genes.Rd
 create mode 100644 tests/testthat/test-subsample_genes.R

diff --git a/R/subsample_genes.R b/R/subsample_genes.R
new file mode 100644
index 0000000..746f1f5
--- /dev/null
+++ b/R/subsample_genes.R
@@ -0,0 +1,45 @@
+#' Subsample genes (fast helper function for zero-inflation checks)
+#' 
+#' @description
+#' Quickly subsample a specified number of **genes** from a Seurat object and
+#' return a smaller Seurat object containing the selected features and all
+#' original wells/samples. This is a lightweight convenience wrapper around
+#' **`seqgendiff::select_counts()`** and is intended for creating a small
+#' working object to run **`check_zeroinflation()`** (or similar
+#' diagnostics) rapidly.
+#' 
+#' @param data   A Seurat object (v4 or v5) with counts in assay "RNA".
+#' @param ngene  Integer. Number of genes to keep (must be <= total genes).
+#' @param gselect Gene-selection strategy as used by
+#'   **`seqgendiff::select_counts()`**. Defaults to `"random"`.
+#' @param seed   Integer random seed for reproducibility.
+#' @return A Seurat object containing the subsampled genes and all original wells/samples.
+#' @export
+#' @examples
+#' data(mini_mac)
+#' subsample_genes(mini_mac, ngene = 50, gselect = "random", seed = 1 )
+#' 
+subsample_genes <- function(data, 
+                            ngene = 100, 
+                            gselect = "random", 
+                            seed = 1){
+  if (utils::packageVersion("Seurat") < "5.0.0"
+  ) {
+    count_matrix <- GetAssayData(data, assay = "RNA", slot = "counts")
+  } else {
+    count_matrix <- GetAssayData(data, assay = "RNA", layer = "counts")
+  }
+  count_matrix <- as.matrix(count_matrix)
+  if (!inherits(data, "Seurat")) {
+    stop("argument 'data' must be a Seurat or TidySeurat object.")
+  }
+  if (ngene > nrow(count_matrix)){
+    stop("ngene should be less than the total number of genes in the dataset.")
+  }
+  set.seed(seed)
+  sub_matrix <- seqgendiff::select_counts(count_matrix, ngene = ngene, gselect = gselect)
+  subsample_genes <- rownames(sub_matrix)
+  data_sub <- data[subsample_genes, ]
+  return(data_sub)
+  
+}
diff --git a/man/subsample_genes.Rd b/man/subsample_genes.Rd
new file mode 100644
index 0000000..5114db1
--- /dev/null
+++ b/man/subsample_genes.Rd
@@ -0,0 +1,34 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/subsample_genes.R
+\name{subsample_genes}
+\alias{subsample_genes}
+\title{Subsample genes (fast helper function for zero-inflation checks)}
+\usage{
+subsample_genes(data, ngene = 100, gselect = "random", seed = 1)
+}
+\arguments{
+\item{data}{A Seurat object (v4 or v5) with counts in assay "RNA".}
+
+\item{ngene}{Integer. Number of genes to keep (must be <= total genes).}
+
+\item{gselect}{Gene-selection strategy as used by
+\strong{\code{seqgendiff::select_counts()}}. Defaults to \code{"random"}.}
+
+\item{seed}{Integer random seed for reproducibility.}
+}
+\value{
+A Seurat object containing the subsampled genes and all original wells/samples.
+}
+\description{
+Quickly subsample a specified number of \strong{genes} from a Seurat object and
+return a smaller Seurat object containing the selected features and all
+original wells/samples. This is a lightweight convenience wrapper around
+\strong{\code{seqgendiff::select_counts()}} and is intended for creating a small
+working object to run \strong{\code{check_zeroinflation()}} (or similar
+diagnostics) rapidly.
+}
+\examples{
+data(mini_mac)
+subsample_genes(mini_mac, ngene = 50, gselect = "random", seed = 1 )
+
+}
diff --git a/tests/testthat/test-subsample_genes.R b/tests/testthat/test-subsample_genes.R
new file mode 100644
index 0000000..a2b5169
--- /dev/null
+++ b/tests/testthat/test-subsample_genes.R
@@ -0,0 +1,17 @@
+test_that("subsample_genes works", {
+  # Load test dataset from tests/testthat/testdata
+  testdata <- get_testdata()
+  expect_error(subsample_genes(testdata,
+                  ngene = 1000,
+                  gselect = "random", 
+                  seed = 1))
+  expect_error(subsample_genes(testdata,
+                  ngene = 100,
+                  gselect = "min", 
+                  seed = 1))
+  subsampled_genes <- subsample_genes(testdata,
+                                     ngene = 50,
+                                     gselect = "random", 
+                                     seed = 1)
+  expect_s4_class(subsampled_genes,"Seurat")
+})

From ee5c3f21461036185f24ef8b6a258e603d6204f0 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 16:00:15 +1100
Subject: [PATCH 07/11] updated check_zeroinflation.R

---
 R/check_zeroinflation.R | 52 +++++++++++++++--------------------------
 1 file changed, 19 insertions(+), 33 deletions(-)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
index a0b97c0..1117ae3 100644
--- a/R/check_zeroinflation.R
+++ b/R/check_zeroinflation.R
@@ -1,7 +1,7 @@
-#' Quick group-aware zero-inflation check (NB baseline via edgeR)
+#' Quick group-aware zero-inflation check (Negative Binomial baseline via edgeR)
 #'
 #' @description
-#' Computes a **group-aware Zero-Inflation (ZI) index** for each gene using a
+#' Computes a **sample group-aware Zero-Inflation (ZI) index** for each gene using a
 #' negative-binomial (NB) baseline fitted with **edgeR**. For each group
 #' (e.g., drug condition), the function:
 #' 1) estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
@@ -13,13 +13,12 @@
 #' standard NB GLM methods (edgeR/DESeq2) are adequate or whether a
 #' zero-aware workflow (e.g., ZINB-WaVE) might be warranted.
 #'
-#' @important
-#' This helper **relies on edgeR** to estimate dispersion. The current
+#' This function **relies on edgeR** to estimate dispersion. The current
 #' implementation requires **≥2 groups** in the design so that edgeR can
 #' stabilize gene-wise dispersions across groups. If you only have a single
 #' group and still want a design-aware baseline for expected zeros, fit a
-#' Gamma–Poisson/NB GLM yourself (e.g., with **glmGamPoi**) and compute the
-#' expected zero probabilities from its fitted means and overdispersion.
+#' Gamma–Poisson/NB GLM (e.g., with **glmGamPoi**) and compute the
+#' expected zero probabilities from its fitted means and over-dispersion.
 #'
 #' @param data Seurat object.
 #' @param group_by Character, column in `data@meta.data` that defines groups
@@ -42,11 +41,11 @@
 #'   (e.g., `glmGamPoi::glm_gp`) will yield more faithful expected-zero rates.
 #' - For single-group experiments, consider either adding a reference group or
 #'   switching to a GLM-based baseline that does not require multiple groups.
-#'
+#' @export
 #' @examples
-#' # check_zeroinflation(mini_mac, group_by = "combined_id",
-#' #                           samples = c("DrugA_10", "DMSO_0"))
-
+#' data(mini_mac)
+#' check_zeroinflation(mini_mac, group_by = "combined_id",
+#'                      samples = c("DMSO_0","Staurosporine_10"))
 
 
 check_zeroinflation <- function(data = NULL,
@@ -63,14 +62,13 @@ check_zeroinflation <- function(data = NULL,
     
     # check samples in combined_id column
     meta_groups <- as.character(data@meta.data[[group_by]])
-    matched_groups <- !is.null(samples) && any(grepl(samples, meta_groups))
-    if (!matched_groups){
+    matched_groups <-  samples %in% meta_groups
+    if (is.null(samples)){
       # all samples included
       samples <- unique(data@meta.data[[group_by]])
       cat("All samples will be included in the combined_id column.")
-    } 
-    # need at least two groups for edgeR dispersion estimation
-    if (length(samples) == 1) {
+    } else if  (length(samples) == 1 || length(which(matched_groups==TRUE)) < 2)  {
+      # need at least two groups for edgeR dispersion estimation
       stop("Two treatment groups are needed to calculate dispersion using edgeR.")
     } 
     # check cutoffs
@@ -79,17 +77,19 @@ check_zeroinflation <- function(data = NULL,
     }
     return(list(data = data, group_by = group_by, samples = samples, cutoffs = cutoffs))
   }
-  
+  validated <- validate_inputs(data, group_by, samples,cutoffs)
+  data <- validated$data
+  group_by <- validated$group_by
+  samples <- validated$samples
+  cutoffs <- validated$cutoffs
   mac_data <- subset(data, subset = combined_id %in% samples)
   count_matrix <- GetAssayData(mac_data, assay = "RNA", layer = "counts")
   count_matrix <- Matrix::Matrix(count_matrix, sparse = TRUE)
   obs_zero <- Matrix::rowMeans(count_matrix == 0)
-  
   # Negative binomial expected zeros
   # using edgeR for dispersion estimation
   dge <- edgeR::DGEList(counts = count_matrix)
   dge <- edgeR::calcNormFactors(dge, method = "TMMwsp")
-  
   #design matrix
   combined_id <- mac_data$combined_id
   #make up batch parameter
@@ -101,31 +101,25 @@ check_zeroinflation <- function(data = NULL,
   # Build per-sample NB mean mu_gj using TMMwsp-scaled library sizes
   # Effective library sizes
   eff_lib <- dge$samples$lib.size * dge$samples$norm.factors
-  
   per_group_gene_metrics <- lapply(samples, function(g){
-    
     idx <- which(combined_id == g)
     n_wells <- length(idx)
     # sub count matrix for group g
     count_matrix_g <- count_matrix[, idx, drop=FALSE]
     # Observed zeros within group g
     p0_obs_g <- Matrix::rowMeans(count_matrix_g==0)
-    
     # count zeros per gene within group g
     # sum later for summary
     obs_zero_num_g <- Matrix::rowSums(count_matrix_g==0)
-    
     # Group-specific q_{g,g} using only wells in group g
     eff_lib_g <- eff_lib[idx]
     total_eff_lib_g <- sum(eff_lib_g)
     total_counts_per_gene_g <- Matrix::rowSums(count_matrix_g)
     q_g_g <- as.numeric(total_counts_per_gene_g) / total_eff_lib_g
-    
     # NB-expected zeros within group g (average over wells in g)
     eps <- 1e-12
     phi_safe <- pmax(phi, eps)
     inv_phi <- 1 / phi_safe
-    
     # Fast loop over wells in g, no GxJ materialization
     p0_nb_sum_g <- numeric(nrow(count_matrix))
     for (j in seq_along(idx)) {
@@ -134,17 +128,14 @@ check_zeroinflation <- function(data = NULL,
       p0_nb_sum_g <- p0_nb_sum_g + (1 + phi_safe * mu_gj)^(-inv_phi)
     }
     p0_nb_g <- p0_nb_sum_g / length(idx)
-    
     # Poisson fallback where phi ~ 0
     poi_idx <- which(phi < 1e-8)
     if (length(poi_idx)) {
       mu_bar_g <- q_g_g * mean(eff_lib_g)
       p0_nb_g[poi_idx] <- exp(-mu_bar_g[poi_idx])
     }
-    
-    # 4) ZI within group g
+    # ZI within group g
     zi_g <- p0_obs_g - p0_nb_g
-    
     data.frame(
       group = g,
       gene  = rownames(count_matrix),
@@ -158,13 +149,10 @@ check_zeroinflation <- function(data = NULL,
       stringsAsFactors = FALSE
     )
   })
-  
   gene_metrics_by_group <- do.call(rbind, per_group_gene_metrics)
-  
   #if there are more than one cutoffs, calculate pct_ZI_gt_ for each cutoff
   # Per-group summaries (one row per group)
   summary_by_group <- do.call(rbind, lapply(split(gene_metrics_by_group, gene_metrics_by_group$group), function(df){
-    
     list_a <- list(
       group            = unique(df$group),
       n_genes          = nrow(df),
@@ -175,13 +163,11 @@ check_zeroinflation <- function(data = NULL,
       observed_zeros_num       = sum(df$obs_zeros_num),
       expected_zeros_num  = sum(df$expected_zeros_num) 
     )
-    
     list_b <- lapply(cutoffs, function(cutoff){
       pct_name <- paste0("pct_ZI_gt_", cutoff)
       pct_value <- mean(df$ZI > cutoff)
       setNames(list(pct_value), pct_name)
     })
-    
     as.data.frame(c(list_a, list_b))
   })) 
   # Return just the selected groups' indices instead of plate-level

From ddb4007248653b759aa4e2ed9cb022aead90fca7 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 8 Oct 2025 16:00:51 +1100
Subject: [PATCH 08/11] update Suggests in DESCRIPTION and NAMESPACE

---
 DESCRIPTION | 3 ++-
 NAMESPACE   | 2 ++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 5322c59..7264e5c 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -75,7 +75,8 @@ Suggests:
     scuttle,
     methods,
     SeuratObject,
-    SummarizedExperiment
+    SummarizedExperiment,
+    seqgendiff
 Config/testthat/edition: 3
 Imports:
     dplyr,
diff --git a/NAMESPACE b/NAMESPACE
index e616818..b7a43c4 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,6 +1,7 @@
 # Generated by roxygen2: do not edit by hand
 
 export(aggregate_by_de)
+export(check_zeroinflation)
 export(compute_chem_descriptors)
 export(compute_de_umap)
 export(compute_hyper_enrich_bg)
@@ -34,6 +35,7 @@ export(plot_qc_metrics_heatmap)
 export(plot_rle)
 export(plot_volcano)
 export(read_metadata)
+export(subsample_genes)
 export(summarise_de)
 export(validate_metadata)
 import(DESeq2)

From 517d272715a846949ece3043cf2460d003e5aa50 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Thu, 9 Oct 2025 11:11:38 +1100
Subject: [PATCH 09/11] update check_zeroinflation.R

---
 R/check_zeroinflation.R    | 4 ++--
 man/check_zeroinflation.Rd | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/R/check_zeroinflation.R b/R/check_zeroinflation.R
index 1117ae3..84ab865 100644
--- a/R/check_zeroinflation.R
+++ b/R/check_zeroinflation.R
@@ -17,7 +17,7 @@
 #' implementation requires **≥2 groups** in the design so that edgeR can
 #' stabilize gene-wise dispersions across groups. If you only have a single
 #' group and still want a design-aware baseline for expected zeros, fit a
-#' Gamma–Poisson/NB GLM (e.g., with **glmGamPoi**) and compute the
+#' Gamma–Poisson/NB GLM and compute the
 #' expected zero probabilities from its fitted means and over-dispersion.
 #'
 #' @param data Seurat object.
@@ -172,7 +172,7 @@ check_zeroinflation <- function(data = NULL,
   })) 
   # Return just the selected groups' indices instead of plate-level
   list(
-    gene_metrics_by_group = gene_metrics_by_group %>% head(10),     # long format: group × gene
+    gene_metrics_by_group = gene_metrics_by_group,     # long format: group × gene
     summary_by_group      = summary_by_group
   )
   
diff --git a/man/check_zeroinflation.Rd b/man/check_zeroinflation.Rd
index d7dd7ec..f79125a 100644
--- a/man/check_zeroinflation.Rd
+++ b/man/check_zeroinflation.Rd
@@ -54,7 +54,7 @@ This function \strong{relies on edgeR} to estimate dispersion. The current
 implementation requires \strong{≥2 groups} in the design so that edgeR can
 stabilize gene-wise dispersions across groups. If you only have a single
 group and still want a design-aware baseline for expected zeros, fit a
-Gamma–Poisson/NB GLM (e.g., with \strong{glmGamPoi}) and compute the
+Gamma–Poisson/NB GLM and compute the
 expected zero probabilities from its fitted means and over-dispersion.
 }
 \note{

From cc04a9bc21dc53d85ba2adf789beaab68c47c39c Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Thu, 9 Oct 2025 11:20:22 +1100
Subject: [PATCH 10/11] add vignettes/check_zero_inflation.Rmd

---
 vignettes/check_zero_inflation.Rmd | 243 +++++++++++++++++++++++++++++
 1 file changed, 243 insertions(+)
 create mode 100644 vignettes/check_zero_inflation.Rmd

diff --git a/vignettes/check_zero_inflation.Rmd b/vignettes/check_zero_inflation.Rmd
new file mode 100644
index 0000000..b74f9e9
--- /dev/null
+++ b/vignettes/check_zero_inflation.Rmd
@@ -0,0 +1,243 @@
+---
+title: "Check for zero-inflation in your data"
+output:
+  rmarkdown::html_document:
+    theme: flatly
+    toc: true
+    toc_float: true
+    toc_depth: 5
+vignette: >
+  %\VignetteIndexEntry{check_zero_inflation}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+  %\VignetteBuild{true}
+  
+---
+
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>",
+  digits = 3
+)
+options(digits = 3) 
+```
+
+
+
+## Overview 
+
+This is a quick demonstration of the `check_zeroinflation()` function from our **macpie** package. This function is a fast diagnostic tool to help you assess whether your MACseq data exhibits zero-inflation relative to a negative binomial (NB) model.
+
+We will use a lightweight convenience wrapper `subsample_genes()` around **`seqgendiff::select_counts()`** to create a smaller object with a subset of genes for faster computation. This function is also part of the **macpie** package. 
+
+Under the hood, `check_zeroinflation()` uses **edgeR** to estimate gene-wise dispersions and compute expected zero probabilities under a NB model. It then compares the observed and expected zero-inflated indexes for each gene within each group defined in the metadata.
+
+This function:
+
+  - estimates gene-wise tagwise dispersions with edgeR (using all selected groups),
+
+  - builds NB-expected zero probabilities from TMMwsp-scaled means, and
+
+  - returns per-gene ZI (observed zeros minus NB-expected zeros) and per-group summaries (e.g., % genes with ZI > 0.05). ZI-cutoffs are user-defined.
+  
+
+The output is a list with two elements:
+
+  - `summary_by_group`: A summary table showing the number and percentage of genes that are zero-inflated for each group at specified cutoffs.
+  
+  - `gene_metrics_by_group`: A detailed table with gene-wise metrics, including observed and expected zero numbers and proportions, zero-inflation index (ZI), mean count, and dispersion estimates for each gene in each group.
+
+**Note**: `check_zeroinflation()` relies on edgeR to estimate dispersion. The current implementation requires ≥2 groups in the design so that edgeR can stabilize gene-wise dispersions across groups. If you only have a single group and still want a design-aware baseline for expected zeros, fit a Gamma–Poisson/NB GLM and compute the expected zero probabilities from its fitted means and over-dispersion.
+
+***
+<br>
+
+```{r set_wd, include=FALSE}
+dir <- "/Users/liuxin/macpie_Dev/"
+devtools::load_all(paste0(dir, "macpie/"))
+```
+
+## Load data
+
+Load data and preprocess before subsampling genes and checking for zero-inflation.
+
+```{r setup}
+#install.packages("macpie")  # or devtools::install_github("PMCC-BioinformaticsCore/macpie")
+library(macpie)
+
+# Define project variables
+project_name <- "PMMSq033"
+project_metadata <- system.file("extdata/PMMSq033_metadata.csv", package = "macpie")
+# Load metadata
+metadata <- read_metadata(project_metadata)
+
+```
+
+```{r load_data}
+# Import raw data
+project_rawdata <- paste0(dir, "/macpieData/PMMSq033/raw_matrix")
+project_name <- "PMMSq033"
+raw_counts <- Read10X(data.dir = project_rawdata)
+# Create tidySeurat object
+mac <- CreateSeuratObject(counts = raw_counts,
+                          project = project_name,
+                          min.cells = 1,
+                          min.features = 1)
+# Join with metadata
+mac <- mac %>%
+  inner_join(metadata, by = c(".cell" = "Barcode"))
+# Add unique identifier
+mac <- mac %>%
+  mutate(combined_id = str_c(Compound_ID, Concentration_1, sep = "_")) %>%
+  mutate(combined_id = gsub(" ", "", .data$combined_id))
+mac <- mac %>%
+  filter(Project == "Current")
+
+```
+
+Filter genes with very low counts across all samples. This step is important because genes with extremely low expression can lead to unreliable estimates of dispersion and expected zero probabilities, which can skew the zero-inflation assessment.
+
+```{r}
+# Filter by read count per sample group
+mac <- filter_genes_by_expression(mac, 
+                                  group_by = "combined_id", 
+                                  min_counts = 10, 
+                                  min_samples = 2)
+```
+
+
+## Subsample genes for faster computation
+
+This is to randomly select a subset of 1000 genes for a quick check. For a more comprehensive analysis, consider using a larger subset.
+
+```{r}
+# Subsample genes for faster computation
+sub_mac <- subsample_genes(mac, ngene = 1000, gselect = "random", seed = 1)
+sub_mac %>% nrow()
+```
+
+
+## Check for zero-inflation  
+
+We can now run the zero-inflation check on the subsampled data.
+
+### DMSO vs Camptothecin
+
+First, we will compare two groups: "DMSO_0" and "CAMPTOTHECIN_10".
+
+
+  
+```{r}
+zi_results <- check_zeroinflation(sub_mac, 
+                                  group_by = "combined_id",
+                                  samples = c("DMSO_0","CAMPTOTHECIN_10"),
+                                  batch = 1,
+                                  cutoffs = c(0.1, 0.2))
+
+```
+
+#### View gene-wise metrics for each group
+
+For DMSO group: 
+
+```{r}
+zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
+```
+
+For Camptothecin group:
+
+```{r}
+zi_results$gene_metrics_by_group %>% filter(group=="CAMPTOTHECIN_10") %>% head()
+```
+
+#### View summary statistics for each group
+
+```{r}
+zi_results$summary_by_group
+```
+
+
+From the summary table, we can see the summary statistics for each group, including the number and percentage of genes that are zero-inflated at the specified cutoffs.
+
+Each of the columns in the summary table are defined as follows:
+
+  - group: The treatment group
+  
+  - n_genes: Total number of genes analyzed in the group
+  
+  - n_wells:  Total number of wells/samples in the group
+  
+  - median_p0_obs: Median observed zero proportion across genes in the group
+  
+  - median_p0_nb: Median expected zero proportion under the NB model across genes in the group
+  
+  - median_ZI: Median zero-inflation index (ZI = p0_nb - p0_obs for each gene) across genes in the group
+  
+  - observed_zeros_num: Number of data points with observed zeros (it shouldn't be more than n_genes*n_wells for each group)
+  
+  - expected_zeros_num: Number of data points with expected zeros under the NB model (same here, it shouldn't be more than n_genes*n_wells for each group)
+  
+  - pct_ZI_gt_0.1: Percentage of genes with ZI greater than 0.1
+  
+  - pct_ZI_gt_0.2: Percentage of genes with ZI greater than 0.2
+
+***  
+<br> 
+
+#### See below for interpretation of the results
+
+For DMSO vs camptothecin, we can see that both DMSO and camptothecin groups have negative median ZI values, indicating that there is no significant zero-inflation in either group. Percentage of genes with ZI greater than 0.1 or 0.2 are 13.5% and 6.6% for camptothecin. As camptothecin acts as a DNA topoisomerase inhibitor and induces DNA damage, it can lead to cell cycle arrest and apoptosis, which may reduce the overall transcriptional activity in the cells. This reduction in transcriptional activity can result in genes being expressed at lower levels or not at all. On the other hand, DMSO is a common solvent used in biological experiments and is not expected to induce significant changes in gene expression. Therefore, it is reasonable to observe a lower level of zero-inflation in the DMSO group compared to the camptothecin group. Same as ZI results for DMSO, the percentage of genes with ZI greater than 0.1 or 0.2 are both 0%.
+
+Besides, the observed zeros are also lower than the expected zeros, further supporting the absence of zero-inflation. Overall, these results suggest that the data does not exhibit significant zero-inflation, and a standard negative binomial model may be appropriate for downstream analyses. Zero-inflated models (ZINB) may not be necessary for this dataset.
+
+
+
+```{r, include=FALSE}
+rm(zi_results)
+```
+
+***
+<br>
+  
+  
+  
+### DMSO vs Staruosporine 
+  
+Next, we will compare two treatment groups: "DMSO_0" and "Staurosporine_10". As staurosporine is a potent inducer of apoptosis, we might expect to see more zero-inflation in this group compared to DMSO.
+
+
+```{r}
+# Check for zero-inflation
+zi_results <- check_zeroinflation(sub_mac, 
+                                  group_by = "combined_id",
+                                  samples = c("DMSO_0","Staurosporine_10"),
+                                  batch = 1,
+                                  cutoffs = c(0.1, 0.2))
+```
+
+
+#### View gene-wise metrics for each group
+
+For DMSO group: 
+
+```{r}
+zi_results$gene_metrics_by_group %>% filter(group=="DMSO_0") %>% head()
+```
+
+For Staurosporine group: 
+```{r}
+zi_results$gene_metrics_by_group %>% filter(group=="Staurosporine_10") %>% head()
+```
+
+#### View summary statistics for each group
+
+```{r}
+zi_results$summary_by_group
+```
+#### See below for interpretation of the results
+
+For DMSO vs staurosporine, we can see that only staurosporine has 14% of median ZI value, the observed zeros number (713) is much higher than the expected zeros number (~355), indicating that there is some level of zero-inflation in the staurosporine group. As staurosporine is a cell death control, which it is expected to see zero-inflated data in this group. In this case, it may be appropriate to consider using a zero-inflated negative binomial (ZINB) model for downstream analyses to account for the excess zeros in the data.
+
+

From a42e67a52d5b6d966b1a2258ab4cdc8215893d5f Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Thu, 9 Oct 2025 11:22:49 +1100
Subject: [PATCH 11/11] update pkgdown.yaml

---
 _pkgdown.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/_pkgdown.yaml b/_pkgdown.yaml
index 6247e65..3fc5d00 100644
--- a/_pkgdown.yaml
+++ b/_pkgdown.yaml
@@ -31,6 +31,8 @@ navbar:
           href: articles/integration_across_modalities.html
         - text: "Working with Bioconductor classes"
           href: articles/macpie_bioc.html
+        - text: "Chck zero-inflation"
+          href: articles/check_zero_inflation.html
     - text: "Reference"
       href: reference/index.html  # Links to your function reference docs.
   right: