From 883f2fe603318b23753d35ae6512a5e321cab7c4 Mon Sep 17 00:00:00 2001
From: Gregory Jefferis <jefferis@gmail.com>
Date: Wed, 29 Nov 2023 21:32:33 +0000
Subject: [PATCH 1/4] implement colScaleM/rowScaleM/geomScaleM

---
 DESCRIPTION                     |  2 +-
 NAMESPACE                       |  3 ++
 R/sparsemat.R                   | 70 ++++++++++++++++++++++++++++++++
 man/colScaleM.Rd                | 72 +++++++++++++++++++++++++++++++++
 tests/testthat/test-sparsemat.R | 34 ++++++++++++++++
 5 files changed, 180 insertions(+), 1 deletion(-)
 create mode 100644 R/sparsemat.R
 create mode 100644 man/colScaleM.Rd
 create mode 100644 tests/testthat/test-sparsemat.R

diff --git a/DESCRIPTION b/DESCRIPTION
index 4918301..0a8dd48 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -18,7 +18,7 @@ Imports:
     bit64,
     checkmate,
     glue,
-    Matrix,
+    Matrix (>= 1.5-3),
     grDevices,
     stats
 Suggests: 
diff --git a/NAMESPACE b/NAMESPACE
index 9d74675..988e48c 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,12 +1,15 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_cluster_info)
+export(colScaleM)
 export(cosine_sim)
 export(dataset_names)
+export(geomScaleM)
 export(id2char)
 export(partner_summary2adjacency_matrix)
 export(prepare_cosine_matrix)
 export(register_dataset)
+export(rowScaleM)
 importFrom(grDevices,hcl.colors)
 importFrom(stats,as.dist)
 importFrom(stats,hclust)
diff --git a/R/sparsemat.R b/R/sparsemat.R
new file mode 100644
index 0000000..ceaf106
--- /dev/null
+++ b/R/sparsemat.R
@@ -0,0 +1,70 @@
+#' Efficient column and row scaling for sparse matrices
+#'
+#' @details Conventionally in connectivity matrices, rows are upstream input
+#'   neurons and columns are downstream output neurons. \code{colScaleM} will
+#'   therefore normalise by the total input onto each downstream neuron while
+#'   \code{rowScaleM} will normalise by the total output from each upstream
+#'   neuron.
+#'
+#'   These functions will work with dense inputs but are less efficient than
+#'   their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
+#'   subset of VNC connectivity data took \itemize{
+#'
+#'   \item colScaleM sparse 3.5ms
+#'
+#'   \item colScaleM dense 1460ms
+#'
+#'   \item scale dense 365ms
+#'
+#'   }
+#'
+#'   In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are
+#'   the way to go for real connectivity matrices! For large matrices they will
+#'   be \emph{much} faster and for small ones, both methods will be fast anyway.
+#'
+#'   Note also that these functions were originally called \code{colScale} etc
+#'   but then the \code{Matrix} package added functions of the same name.
+#'
+#' @param A a sparse (or dense) matrix
+#' @param na.rm when \code{T}, the default, converts any NA values (usually
+#'   resulting from divide by zero errors when a neuron has no partners) to 0
+#' @description \code{colScale} normalises a matrix by the sum of each column
+#'
+#' @return A scaled sparse matrix
+#' @export
+#'
+#' @seealso \code{Matrix::\link[Matrix]{colScale}} on which these are based and,
+#'   for the original version,
+#'   \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r}
+#'
+#' @examples
+#' library(Matrix)
+#' set.seed(42)
+#' A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+#' rownames(A)=letters[1:10]
+#' colnames(A)=LETTERS[1:10]
+#'
+#' colScaleM(A)
+#' rowScaleM(A)
+#' geomScaleM(A)
+colScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::colSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::colScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description \code{rowScale} normalises a matrix by the sum of each row
+rowScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::rowSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::rowScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description sqrt of both column and row normalisation
+geomScaleM <- function(A, na.rm=TRUE) {
+  sqrt(colScaleM(A, na.rm=na.rm)*rowScaleM(A, na.rm=na.rm))
+}
diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd
new file mode 100644
index 0000000..3bc8e86
--- /dev/null
+++ b/man/colScaleM.Rd
@@ -0,0 +1,72 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sparsemat.R
+\name{colScaleM}
+\alias{colScaleM}
+\alias{rowScaleM}
+\alias{geomScaleM}
+\title{Efficient column and row scaling for sparse matrices}
+\usage{
+colScaleM(A, na.rm = TRUE)
+
+rowScaleM(A, na.rm = TRUE)
+
+geomScaleM(A, na.rm = TRUE)
+}
+\arguments{
+\item{A}{a sparse (or dense) matrix}
+
+\item{na.rm}{when \code{T}, the default, converts any NA values (usually
+resulting from divide by zero errors when a neuron has no partners) to 0}
+}
+\value{
+A scaled sparse matrix
+}
+\description{
+\code{colScale} normalises a matrix by the sum of each column
+
+\code{rowScale} normalises a matrix by the sum of each row
+
+sqrt of both column and row normalisation
+}
+\details{
+Conventionally in connectivity matrices, rows are upstream input
+  neurons and columns are downstream output neurons. \code{colScaleM} will
+  therefore normalise by the total input onto each downstream neuron while
+  \code{rowScaleM} will normalise by the total output from each upstream
+  neuron.
+
+  These functions will work with dense inputs but are less efficient than
+  their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
+  subset of VNC connectivity data took \itemize{
+
+  \item colScaleM sparse 3.5ms
+
+  \item colScaleM dense 1460ms
+
+  \item scale dense 365ms
+
+  }
+
+  In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are
+  the way to go for real connectivity matrices! For large matrices they will
+  be \emph{much} faster and for small ones, both methods will be fast anyway.
+
+  Note also that these functions were originally called \code{colScale} etc
+  but then the \code{Matrix} package added functions of the same name.
+}
+\examples{
+library(Matrix)
+set.seed(42)
+A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+rownames(A)=letters[1:10]
+colnames(A)=LETTERS[1:10]
+
+colScaleM(A)
+rowScaleM(A)
+geomScaleM(A)
+}
+\seealso{
+\code{Matrix::\link[Matrix]{colScale}} on which these are based and,
+  for the original version,
+  \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r}
+}
diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R
new file mode 100644
index 0000000..9d20ff7
--- /dev/null
+++ b/tests/testthat/test-sparsemat.R
@@ -0,0 +1,34 @@
+test_that("multiplication works", {
+  # library(Matrix)
+  # set.seed(42)
+  # A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+  # rownames(A)=letters[1:10]
+  # colnames(A)=LETTERS[1:10]
+  A=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L,
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(2, 2, 1, 1, 1, 1, 1, 1,
+2, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1), factors = list())
+  colScaleM(A)
+  csA=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L,
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(0.222222222222222, 0.222222222222222,
+0.111111111111111, 0.111111111111111, 0.111111111111111, 0.111111111111111,
+0.111111111111111, 0.142857142857143, 0.285714285714286, 0.285714285714286,
+0.285714285714286, 0.125, 0.375, 0.25, 0.125, 0.125, 0.166666666666667,
+0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667,
+0.166666666666667, 0.222222222222222, 0.222222222222222, 0.111111111111111,
+0.111111111111111, 0.222222222222222, 0.111111111111111, 0.333333333333333,
+0.333333333333333, 0.333333333333333, 0.142857142857143, 0.285714285714286,
+0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143,
+1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list())
+  expect_equal(colScaleM(A), csA)
+})

From 0e071d9ea7318861ef3a2d9b0be1f1985e359db2 Mon Sep 17 00:00:00 2001
From: Gregory Jefferis <jefferis@gmail.com>
Date: Wed, 29 Nov 2023 21:36:43 +0000
Subject: [PATCH 2/4] faster implementation of geomScaleM

* avoid matrix multiplication and just use scale facs instead.
* test vs old implementation
---
 R/sparsemat.R                   | 6 +++++-
 tests/testthat/test-sparsemat.R | 3 +++
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/R/sparsemat.R b/R/sparsemat.R
index ceaf106..0489d16 100644
--- a/R/sparsemat.R
+++ b/R/sparsemat.R
@@ -66,5 +66,9 @@ rowScaleM <- function(A, na.rm=TRUE) {
 #' @rdname colScaleM
 #' @description sqrt of both column and row normalisation
 geomScaleM <- function(A, na.rm=TRUE) {
-  sqrt(colScaleM(A, na.rm=na.rm)*rowScaleM(A, na.rm=na.rm))
+  sf1=sqrt(1/Matrix::rowSums(A))
+  if(na.rm) sf1[!is.finite(sf1)]=0
+  sf2=sqrt(1/Matrix::colSums(A))
+  if(na.rm) sf2[!is.finite(sf2)]=0
+  Matrix::dimScale(A, d1 = sf1, d2 = sf2)
 }
diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R
index 9d20ff7..d444a59 100644
--- a/tests/testthat/test-sparsemat.R
+++ b/tests/testthat/test-sparsemat.R
@@ -31,4 +31,7 @@ test_that("multiplication works", {
 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143,
 1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list())
   expect_equal(colScaleM(A), csA)
+
+  expect_equal(geomScaleM(A),
+    sqrt(colScaleM(A)*rowScaleM(A)))
 })

From 35436e56d6443d80297765c7e3663fc0f34ea4f2 Mon Sep 17 00:00:00 2001
From: Gregory Jefferis <jefferis@gmail.com>
Date: Thu, 25 Jul 2024 15:20:16 +0100
Subject: [PATCH 3/4] spelling

---
 DESCRIPTION      | 2 +-
 R/sparsemat.R    | 6 +++---
 inst/WORDLIST    | 2 ++
 man/colScaleM.Rd | 6 +++---
 4 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 0a8dd48..5f97019 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -13,7 +13,7 @@ Description: Provides data source agnostic utility functions to support
 License: GPL (>= 3)
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.2
 Imports: 
     bit64,
     checkmate,
diff --git a/R/sparsemat.R b/R/sparsemat.R
index 0489d16..38c399b 100644
--- a/R/sparsemat.R
+++ b/R/sparsemat.R
@@ -10,11 +10,11 @@
 #'   their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
 #'   subset of VNC connectivity data took \itemize{
 #'
-#'   \item colScaleM sparse 3.5ms
+#'   \item \code{colScaleM} sparse 3.5ms
 #'
-#'   \item colScaleM dense 1460ms
+#'   \item \code{colScaleM} dense 1460ms
 #'
-#'   \item scale dense 365ms
+#'   \item \code{scale} dense 365ms
 #'
 #'   }
 #'
diff --git a/inst/WORDLIST b/inst/WORDLIST
index aa15387..eebe043 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -5,8 +5,10 @@ Lifecycle
 Natverse
 ORCID
 PNs
+VNC
 connectome
 connectomics
+etc
 fafbseg
 flywire
 natverse
diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd
index 3bc8e86..d3d3a72 100644
--- a/man/colScaleM.Rd
+++ b/man/colScaleM.Rd
@@ -39,11 +39,11 @@ Conventionally in connectivity matrices, rows are upstream input
   their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
   subset of VNC connectivity data took \itemize{
 
-  \item colScaleM sparse 3.5ms
+  \item \code{colScaleM} sparse 3.5ms
 
-  \item colScaleM dense 1460ms
+  \item \code{colScaleM} dense 1460ms
 
-  \item scale dense 365ms
+  \item \code{scale} dense 365ms
 
   }
 

From 52c083c7b54fc490a5827c65b2f8ddd15448ebce Mon Sep 17 00:00:00 2001
From: Gregory Jefferis <jefferis@gmail.com>
Date: Sun, 17 Aug 2025 13:32:19 +0100
Subject: [PATCH 4/4] doc polishing

---
 R/sparsemat.R    | 7 ++++++-
 man/colScaleM.Rd | 7 ++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/R/sparsemat.R b/R/sparsemat.R
index 38c399b..fd41bf2 100644
--- a/R/sparsemat.R
+++ b/R/sparsemat.R
@@ -23,7 +23,12 @@
 #'   be \emph{much} faster and for small ones, both methods will be fast anyway.
 #'
 #'   Note also that these functions were originally called \code{colScale} etc
-#'   but then the \code{Matrix} package added functions of the same name.
+#'   but then the \code{\link[Matrix]{Matrix}} package added functions of the
+#'   same name. The functions in this package differ in two respects from the
+#'   \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First
+#'   they calculate the required row or column sums to perform the scaling.
+#'   Second, they handle the inevitable zero sum rows or columns via the default
+#'   \code{na.rm} argument.
 #'
 #' @param A a sparse (or dense) matrix
 #' @param na.rm when \code{T}, the default, converts any NA values (usually
diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd
index d3d3a72..696c2f5 100644
--- a/man/colScaleM.Rd
+++ b/man/colScaleM.Rd
@@ -52,7 +52,12 @@ Conventionally in connectivity matrices, rows are upstream input
   be \emph{much} faster and for small ones, both methods will be fast anyway.
 
   Note also that these functions were originally called \code{colScale} etc
-  but then the \code{Matrix} package added functions of the same name.
+  but then the \code{\link[Matrix]{Matrix}} package added functions of the
+  same name. The functions in this package differ in two respects from the
+  \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First
+  they calculate the required row or column sums to perform the scaling.
+  Second, they handle the inevitable zero sum rows or columns via the default
+  \code{na.rm} argument.
 }
 \examples{
 library(Matrix)