From 3c05c75dc334f901a9aa8910e2d1f19d449c69c6 Mon Sep 17 00:00:00 2001 From: Xin Liu Date: Wed, 22 Oct 2025 17:30:49 +1100 Subject: [PATCH] fix non-ASCII in select_robust_controls.R --- R/select_robust_controls.R | 70 ++++++++++++++++++----------------- R/utils.R | 3 +- man/select_robust_controls.Rd | 59 ++++++++++++++--------------- 3 files changed, 68 insertions(+), 64 deletions(-) diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R index 0bf62f8..1a8d3eb 100644 --- a/R/select_robust_controls.R +++ b/R/select_robust_controls.R @@ -2,57 +2,59 @@ #' #' @description #' For a given control group (e.g., DMSO) on a specific plate/batch, this function -#' ranks samples by their average correlation (Fisher z–averaged) to all *other* +#' ranks samples by their average correlation (Fisher z-averaged) to all *other* #' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) -#' plots per-sample expression distributions and sample–sample correlation heatmaps. +#' plots per-sample expression distributions and sample-sample correlation heatmaps. #' #' @param data A tidyseurat object containing an RNA assay with a **counts** layer. #' @param samples the control/treatment label to keep in column samples -#' (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered. +#' (e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered. #' @param orig_ident Character scalar: the plate/batch identifier to keep -#' (e.g., `"VH02012942"`). Only cells/samples from this batch are considered. +#' (e.g., "VH02012942"). Only cells/samples from this batch are considered. #' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to -#' normalization (default `1`). -#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least -#' this many samples to be retained (default `16`). +#' normalization (default 1). 
+#' @param min_samps Integer; a gene must be expressed (CPM > cpm_filter) in at least +#' this many samples to be retained (default 16). #' @param corr_method Correlation type used for ranking; one of -#' `c("spearman","pearson")` (default `"spearman"`). -#' @param top_n Integer; the number of top-ranked samples to report in `topN`. -#' Ties at the cutoff are kept (default `5`). -#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman -#' correlation heatmaps (default `TRUE`). +#' c("spearman","pearson") (default "spearman"). +#' @param top_n Integer; the number of top-ranked samples to report in topN. +#' Ties at the cutoff are kept (default 5). +#' @param make_plots Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +#' correlation heatmaps (default TRUE). #' #' @details #' Workflow: -#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch). -#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`. -#' 3) Normalize with **TMMwsp** and compute log2-CPM. -#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples -#' (according to `corr_method`). +#' 1) Subset to the specified samples and orig_ident (plate/batch). +#' 2) Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +#' 3) Normalize with TMMwsp and compute log2-CPM. +#' 4) Rank samples by mean Fisher z-transformed correlation to all other samples +#' (according to corr_method). #' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally) #' plots for QC. #' -#' Column names of the counts matrix are rewritten to `"_"` +#' Column names of the counts matrix are rewritten to "_" #' for easier visual inspection in plots. 
#' #' @return A list with elements: -#' \item{subset_obj}{The Seurat object subset used for analysis.} -#' \item{dge}{The filtered `edgeR::DGEList`.} -#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} -#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).} -#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.} -#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.} -#' \item{ranking_method}{The correlation method used for ranking.} -#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed -#' correlations (higher = better), sorted decreasing.} -#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} -#' +#' * subset_obj: The Seurat object subset used for analysis. +#' * dge: The filtered edgeR::DGEList. +#' * log_cpm_tmm: Matrix of TMMwsp log2-CPM. +#' * boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +#' * cor_pearson: Sample-sample Pearson correlation matrix. +#' * cor_spearman: Sample-sample Spearman correlation matrix. +#' * ranking_method: The correlation method used for ranking. +#' * scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +#' correlations (higher = better), sorted decreasing. +#' * topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). 
+ +#' +#' #' @examples #' data(mini_mac) #' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") #' #' -#' +#' @importFrom rlang .data #' @importFrom edgeR DGEList calcNormFactors cpm #' @importFrom tibble rownames_to_column #' @importFrom tidyr pivot_longer @@ -72,7 +74,7 @@ select_robust_controls <- function( if (!inherits(data, "Seurat")) { stop("argument 'data' must be a Seurat or TidySeurat object.") } - + # check samples and orig_ident columns if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) { stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.") @@ -123,7 +125,7 @@ select_robust_controls <- function( warning("Package 'ggplot2' not available; skipping boxplot.") } else { print( - ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) + + ggplot2::ggplot(df_long, ggplot2::aes(x = .data$sample, y = .data$log_cpm)) + ggplot2::geom_boxplot(outlier.size = 0.5) + ggplot2::labs(x = "Sample", y = "log2 CPM", title = "Boxplot of log2-CPM (TMMwsp)") + @@ -139,8 +141,8 @@ select_robust_controls <- function( if (!requireNamespace("pheatmap", quietly = TRUE)) { warning("Package 'pheatmap' not available; skipping heatmaps.") } else { - pheatmap::pheatmap(cors_pear, main = "Sample–sample correlation (Pearson, log2-CPM)") - pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)") + pheatmap::pheatmap(cors_pear, main = "Pearson correlation") + pheatmap::pheatmap(cors_spear, main = "Spearman correlation") } } # Ranking by mean Fisher-z correlation to all *other* samples diff --git a/R/utils.R b/R/utils.R index 0718920..3036c14 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,4 @@ utils::globalVariables(c("diff_expressed", "gene_labels", ".cell", "UMAPde_1", "UMAPde_2", - "padj", "target", "target_id")) \ No newline at end of file + "padj", "target", "target_id", + "log_cpm")) \ No newline at end of file diff --git 
a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd index e05a997..fb4254f 100644 --- a/man/select_robust_controls.Rd +++ b/man/select_robust_controls.Rd @@ -19,58 +19,60 @@ select_robust_controls( \item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.} \item{samples}{the control/treatment label to keep in column samples -(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.} +(e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.} \item{orig_ident}{Character scalar: the plate/batch identifier to keep -(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.} +(e.g., "VH02012942"). Only cells/samples from this batch are considered.} \item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to -normalization (default \code{1}).} +normalization (default 1).} -\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least -this many samples to be retained (default \code{16}).} +\item{min_samps}{Integer; a gene must be expressed (CPM > cpm_filter) in at least +this many samples to be retained (default 16).} \item{corr_method}{Correlation type used for ranking; one of -\code{c("spearman","pearson")} (default \code{"spearman"}).} +c("spearman","pearson") (default "spearman").} -\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}. -Ties at the cutoff are kept (default \code{5}).} +\item{top_n}{Integer; the number of top-ranked samples to report in topN. 
+Ties at the cutoff are kept (default 5).} -\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman -correlation heatmaps (default \code{TRUE}).} +\item{make_plots}{Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman +correlation heatmaps (default TRUE).} } \value{ A list with elements: -\item{subset_obj}{The Seurat object subset used for analysis.} -\item{dge}{The filtered \code{edgeR::DGEList}.} -\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).} -\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).} -\item{cor_pearson}{Sample–sample Pearson correlation matrix.} -\item{cor_spearman}{Sample–sample Spearman correlation matrix.} -\item{ranking_method}{The correlation method used for ranking.} -\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed -correlations (higher = better), sorted decreasing.} -\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).} +\itemize{ +\item subset_obj: The Seurat object subset used for analysis. +\item dge: The filtered edgeR::DGEList. +\item log_cpm_tmm: Matrix of TMMwsp log2-CPM. +\item boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm). +\item cor_pearson: Sample-sample Pearson correlation matrix. +\item cor_spearman: Sample-sample Spearman correlation matrix. +\item ranking_method: The correlation method used for ranking. +\item scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed +correlations (higher = better), sorted decreasing. +\item topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept). 
+} } \description{ For a given control group (e.g., DMSO) on a specific plate/batch, this function -ranks samples by their average correlation (Fisher z–averaged) to all \emph{other} +ranks samples by their average correlation (Fisher z-averaged) to all \emph{other} samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally) -plots per-sample expression distributions and sample–sample correlation heatmaps. +plots per-sample expression distributions and sample-sample correlation heatmaps. } \details{ Workflow: \enumerate{ -\item Subset to the specified \code{samples} \strong{and} \code{orig_ident} (plate/batch). -\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}. -\item Normalize with \strong{TMMwsp} and compute log2-CPM. -\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples -(according to \code{corr_method}). +\item Subset to the specified samples and orig_ident (plate/batch). +\item Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps. +\item Normalize with TMMwsp and compute log2-CPM. +\item Rank samples by mean Fisher z-transformed correlation to all other samples +(according to corr_method). \item Return the ranking, correlation matrices, the normalized matrix, and (optionally) plots for QC. } -Column names of the counts matrix are rewritten to \code{"_"} +Column names of the counts matrix are rewritten to "_" for easier visual inspection in plots. } \examples{ @@ -78,5 +80,4 @@ data(mini_mac) res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini") - }