From 3c05c75dc334f901a9aa8910e2d1f19d449c69c6 Mon Sep 17 00:00:00 2001
From: Xin Liu <xin.liu@petermac.org>
Date: Wed, 22 Oct 2025 17:30:49 +1100
Subject: [PATCH] fix non-ASCII in select_robust_controls.R

---
 R/select_robust_controls.R    | 70 ++++++++++++++++++-----------------
 R/utils.R                     |  3 +-
 man/select_robust_controls.Rd | 59 ++++++++++++++---------------
 3 files changed, 68 insertions(+), 64 deletions(-)

diff --git a/R/select_robust_controls.R b/R/select_robust_controls.R
index 0bf62f8..1a8d3eb 100644
--- a/R/select_robust_controls.R
+++ b/R/select_robust_controls.R
@@ -2,57 +2,59 @@
 #'
 #' @description
 #' For a given control group (e.g., DMSO) on a specific plate/batch, this function
-#' ranks samples by their average correlation (Fisher z–averaged) to all *other*
+#' ranks samples by their average correlation (Fisher z-averaged) to all *other*
 #' samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally)
-#' plots per-sample expression distributions and sample–sample correlation heatmaps.
+#' plots per-sample expression distributions and sample-sample correlation heatmaps.
 #'
 #' @param data A tidyseurat object containing an RNA assay with a **counts** layer.
 #' @param samples the control/treatment label to keep in column samples
-#'   (e.g., `"CB_43_EP73_0"`). Only cells/samples with this label are considered.
+#'   (e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.
 #' @param orig_ident Character scalar: the plate/batch identifier to keep
-#'   (e.g., `"VH02012942"`). Only cells/samples from this batch are considered.
+#'   (e.g., "VH02012942"). Only cells/samples from this batch are considered.
 #' @param cpm_filter Numeric scalar; CPM threshold used for gene filtering prior to
-#'   normalization (default `1`).
-#' @param min_samps Integer; a gene must be expressed (CPM > `cpm_filter`) in at least
-#'   this many samples to be retained (default `16`).
+#'   normalization (default 1).
+#' @param min_samps Integer; a gene must be expressed (CPM > cpm_filter) in at least
+#'   this many samples to be retained (default 16).
 #' @param corr_method Correlation type used for ranking; one of
-#'   `c("spearman","pearson")` (default `"spearman"`).
-#' @param top_n Integer; the number of top-ranked samples to report in `topN`.
-#'   Ties at the cutoff are kept (default `5`).
-#' @param make_plots Logical; if `TRUE`, print a log2-CPM boxplot and Pearson/Spearman
-#'   correlation heatmaps (default `TRUE`).
+#'   c("spearman","pearson") (default "spearman").
+#' @param top_n Integer; the number of top-ranked samples to report in topN.
+#'   Ties at the cutoff are kept (default 5).
+#' @param make_plots Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman
+#'   correlation heatmaps (default TRUE).
 #'
 #' @details
 #' Workflow:
-#' 1) Subset to the specified `samples` **and** `orig_ident` (plate/batch).  
-#' 2) Build an `edgeR::DGEList`, filter lowly expressed genes using CPM and `min_samps`.  
-#' 3) Normalize with **TMMwsp** and compute log2-CPM.  
-#' 4) Rank samples by mean Fisher z–transformed correlation to all *other* samples
-#'    (according to `corr_method`).  
+#' 1) Subset to the specified samples and orig_ident (plate/batch).
+#' 2) Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps.
+#' 3) Normalize with TMMwsp and compute log2-CPM.
+#' 4) Rank samples by mean Fisher z-transformed correlation to all other samples
+#'    (according to corr_method).
 #' 5) Return the ranking, correlation matrices, the normalized matrix, and (optionally)
 #'    plots for QC.
 #'
-#' Column names of the counts matrix are rewritten to `"<orig.ident>_<Well_ID>"`
+#' Column names of the counts matrix are rewritten to "<orig.ident>_<Well_ID>"
 #' for easier visual inspection in plots.
 #'
 #' @return A list with elements:
-#' \item{subset_obj}{The Seurat object subset used for analysis.}
-#' \item{dge}{The filtered `edgeR::DGEList`.}
-#' \item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).}
-#' \item{boxplot_df}{Long-format data frame used for the boxplot (`gene`, `sample`, `log_cpm`).}
-#' \item{cor_pearson}{Sample–sample Pearson correlation matrix.}
-#' \item{cor_spearman}{Sample–sample Spearman correlation matrix.}
-#' \item{ranking_method}{The correlation method used for ranking.}
-#' \item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed
-#'   correlations (higher = better), sorted decreasing.}
-#' \item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).}
-#'
+#' * subset_obj: The Seurat object subset used for analysis.
+#' * dge: The filtered edgeR::DGEList.
+#' * log_cpm_tmm: Matrix of TMMwsp log2-CPM (genes x samples).
+#' * boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm).
+#' * cor_pearson: Sample-sample Pearson correlation matrix.
+#' * cor_spearman: Sample-sample Spearman correlation matrix.
+#' * ranking_method: The correlation method used for ranking.
+#' * scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed
+#'   correlations (higher = better), sorted decreasing.
+#' * topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept).
+#'
+#'
+#'
 #' @examples
 #' data(mini_mac)
 #' res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini")
 #'
 #' 
-#'
+#' @importFrom rlang .data
 #' @importFrom edgeR DGEList calcNormFactors cpm
 #' @importFrom tibble rownames_to_column
 #' @importFrom tidyr pivot_longer
@@ -72,7 +74,7 @@ select_robust_controls <- function(
     if (!inherits(data, "Seurat")) {
       stop("argument 'data' must be a Seurat or TidySeurat object.")
     }
-
+    
     # check samples and orig_ident columns
     if (colnames(data@meta.data)%in% c("combined_id","orig.ident") %>% sum() < 2) {
       stop("The 'data' object must contain 'combined_id' and 'orig.ident' columns in its metadata.")
@@ -123,7 +125,7 @@ select_robust_controls <- function(
       warning("Package 'ggplot2' not available; skipping boxplot.")
     } else {
       print(
-        ggplot2::ggplot(df_long, ggplot2::aes(x = sample, y = log_cpm)) +
+        ggplot2::ggplot(df_long, ggplot2::aes(x = .data$sample, y = .data$log_cpm)) +
           ggplot2::geom_boxplot(outlier.size = 0.5) +
           ggplot2::labs(x = "Sample", y = "log2 CPM",
                         title = "Boxplot of log2-CPM (TMMwsp)") +
@@ -139,8 +141,8 @@ select_robust_controls <- function(
     if (!requireNamespace("pheatmap", quietly = TRUE)) {
       warning("Package 'pheatmap' not available; skipping heatmaps.")
     } else {
-      pheatmap::pheatmap(cors_pear,  main = "Sample–sample correlation (Pearson, log2-CPM)")
-      pheatmap::pheatmap(cors_spear, main = "Sample–sample correlation (Spearman, log2-CPM)")
+      pheatmap::pheatmap(cors_pear,  main = "Pearson correlation")
+      pheatmap::pheatmap(cors_spear, main = "Spearman correlation")
     }
   }
   # Ranking by mean Fisher-z correlation to all *other* samples
diff --git a/R/utils.R b/R/utils.R
index 0718920..3036c14 100644
--- a/R/utils.R
+++ b/R/utils.R
@@ -1,3 +1,4 @@
 utils::globalVariables(c("diff_expressed", "gene_labels", ".cell",
                          "UMAPde_1", "UMAPde_2",
-                         "padj", "target", "target_id"))
\ No newline at end of file
+                         "padj", "target", "target_id",
+                         "log_cpm"))
\ No newline at end of file
diff --git a/man/select_robust_controls.Rd b/man/select_robust_controls.Rd
index e05a997..fb4254f 100644
--- a/man/select_robust_controls.Rd
+++ b/man/select_robust_controls.Rd
@@ -19,58 +19,60 @@ select_robust_controls(
 \item{data}{A tidyseurat object containing an RNA assay with a \strong{counts} layer.}
 
 \item{samples}{the control/treatment label to keep in column samples
-(e.g., \code{"CB_43_EP73_0"}). Only cells/samples with this label are considered.}
+(e.g., "CB_43_EP73_0"). Only cells/samples with this label are considered.}
 
 \item{orig_ident}{Character scalar: the plate/batch identifier to keep
-(e.g., \code{"VH02012942"}). Only cells/samples from this batch are considered.}
+(e.g., "VH02012942"). Only cells/samples from this batch are considered.}
 
 \item{cpm_filter}{Numeric scalar; CPM threshold used for gene filtering prior to
-normalization (default \code{1}).}
+normalization (default 1).}
 
-\item{min_samps}{Integer; a gene must be expressed (CPM > \code{cpm_filter}) in at least
-this many samples to be retained (default \code{16}).}
+\item{min_samps}{Integer; a gene must be expressed (CPM > cpm_filter) in at least
+this many samples to be retained (default 16).}
 
 \item{corr_method}{Correlation type used for ranking; one of
-\code{c("spearman","pearson")} (default \code{"spearman"}).}
+c("spearman","pearson") (default "spearman").}
 
-\item{top_n}{Integer; the number of top-ranked samples to report in \code{topN}.
-Ties at the cutoff are kept (default \code{5}).}
+\item{top_n}{Integer; the number of top-ranked samples to report in topN.
+Ties at the cutoff are kept (default 5).}
 
-\item{make_plots}{Logical; if \code{TRUE}, print a log2-CPM boxplot and Pearson/Spearman
-correlation heatmaps (default \code{TRUE}).}
+\item{make_plots}{Logical; if TRUE, print a log2-CPM boxplot and Pearson/Spearman
+correlation heatmaps (default TRUE).}
 }
 \value{
 A list with elements:
-\item{subset_obj}{The Seurat object subset used for analysis.}
-\item{dge}{The filtered \code{edgeR::DGEList}.}
-\item{log_cpm_tmm}{Matrix of TMMwsp log2-CPM (genes × samples).}
-\item{boxplot_df}{Long-format data frame used for the boxplot (\code{gene}, \code{sample}, \code{log_cpm}).}
-\item{cor_pearson}{Sample–sample Pearson correlation matrix.}
-\item{cor_spearman}{Sample–sample Spearman correlation matrix.}
-\item{ranking_method}{The correlation method used for ranking.}
-\item{scores_mean_to_others}{Named numeric vector of mean Fisher-z back-transformed
-correlations (higher = better), sorted decreasing.}
-\item{topN}{Named numeric vector of the top-ranked samples (ties at the cutoff kept).}
+\itemize{
+\item subset_obj: The Seurat object subset used for analysis.
+\item dge: The filtered edgeR::DGEList.
+\item log_cpm_tmm: Matrix of TMMwsp log2-CPM (genes x samples).
+\item boxplot_df: Long-format data frame used for the boxplot (gene, sample, log_cpm).
+\item cor_pearson: Sample-sample Pearson correlation matrix.
+\item cor_spearman: Sample-sample Spearman correlation matrix.
+\item ranking_method: The correlation method used for ranking.
+\item scores_mean_to_others: Named numeric vector of mean Fisher-z back-transformed
+correlations (higher = better), sorted decreasing.
+\item topN: Named numeric vector of the top-ranked samples (ties at the cutoff kept).
+}
 }
 \description{
 For a given control group (e.g., DMSO) on a specific plate/batch, this function
-ranks samples by their average correlation (Fisher z–averaged) to all \emph{other}
+ranks samples by their average correlation (Fisher z-averaged) to all \emph{other}
 samples using edgeR's TMMwsp-normalized log2-CPM. It returns the ranking and (optionally)
-plots per-sample expression distributions and sample–sample correlation heatmaps.
+plots per-sample expression distributions and sample-sample correlation heatmaps.
 }
 \details{
 Workflow:
 \enumerate{
-\item Subset to the specified \code{samples} \strong{and} \code{orig_ident} (plate/batch).
-\item Build an \code{edgeR::DGEList}, filter lowly expressed genes using CPM and \code{min_samps}.
-\item Normalize with \strong{TMMwsp} and compute log2-CPM.
-\item Rank samples by mean Fisher z–transformed correlation to all \emph{other} samples
-(according to \code{corr_method}).
+\item Subset to the specified samples and orig_ident (plate/batch).
+\item Build an edgeR::DGEList, filter lowly expressed genes using CPM and min_samps.
+\item Normalize with TMMwsp and compute log2-CPM.
+\item Rank samples by mean Fisher z-transformed correlation to all other samples
+(according to corr_method).
 \item Return the ranking, correlation matrices, the normalized matrix, and (optionally)
 plots for QC.
 }
 
-Column names of the counts matrix are rewritten to \code{"<orig.ident>_<Well_ID>"}
+Column names of the counts matrix are rewritten to "<orig.ident>_<Well_ID>"
 for easier visual inspection in plots.
 }
 \examples{
@@ -78,5 +80,4 @@ data(mini_mac)
 res <- select_robust_controls(mini_mac,samples = "DMSO_0", orig_ident = "PMMSq033_mini")
 
 
-
 }