From 883f2fe603318b23753d35ae6512a5e321cab7c4 Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Wed, 29 Nov 2023 21:32:33 +0000 Subject: [PATCH 1/4] implement colScaleM/rowScaleM/geomScaleM --- DESCRIPTION | 2 +- NAMESPACE | 3 ++ R/sparsemat.R | 70 ++++++++++++++++++++++++++++++++ man/colScaleM.Rd | 72 +++++++++++++++++++++++++++++++++ tests/testthat/test-sparsemat.R | 34 ++++++++++++++++ 5 files changed, 180 insertions(+), 1 deletion(-) create mode 100644 R/sparsemat.R create mode 100644 man/colScaleM.Rd create mode 100644 tests/testthat/test-sparsemat.R diff --git a/DESCRIPTION b/DESCRIPTION index 4918301..0a8dd48 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ Imports: bit64, checkmate, glue, - Matrix, + Matrix (>= 1.5-3), grDevices, stats Suggests: diff --git a/NAMESPACE b/NAMESPACE index 9d74675..988e48c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,15 @@ # Generated by roxygen2: do not edit by hand export(add_cluster_info) +export(colScaleM) export(cosine_sim) export(dataset_names) +export(geomScaleM) export(id2char) export(partner_summary2adjacency_matrix) export(prepare_cosine_matrix) export(register_dataset) +export(rowScaleM) importFrom(grDevices,hcl.colors) importFrom(stats,as.dist) importFrom(stats,hclust) diff --git a/R/sparsemat.R b/R/sparsemat.R new file mode 100644 index 0000000..ceaf106 --- /dev/null +++ b/R/sparsemat.R @@ -0,0 +1,70 @@ +#' Efficient column and row scaling for sparse matrices +#' +#' @details Conventionally in connectivity matrices, rows are upstream input +#' neurons and columns are downstream output neurons. \code{colScaleM} will +#' therefore normalise by the total input onto each downstream neuron while +#' \code{rowScaleM} will normalise by the total output from each upstream +#' neuron. +#' +#' These functions will work with dense inputs but are less efficient than +#' their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k +#' subset of VNC connectivity data took \itemize{ +#' +#' \item colScaleM sparse 3.5ms +#' +#' \item colScaleM dense 1460ms +#' +#' \item scale dense 365ms +#' +#' } +#' +#' In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are +#' the way to go for real connectivity matrices! For large matrices they will +#' be \emph{much} faster and for small ones, both methods will be fast anyway. +#' +#' Note also that these functions were originally called \code{colScale} etc +#' but then the \code{Matrix} package added functions of the same name. +#' +#' @param A a sparse (or dense) matrix +#' @param na.rm when \code{T}, the default, converts any NA values (usually +#' resulting from divide by zero errors when a neuron has no partners) to 0 +#' @description \code{colScale} normalises a matrix by the sum of each column +#' +#' @return A scaled sparse matrix +#' @export +#' +#' @seealso \code{Matrix::\link[Matrix]{colScale}} on which these are based and, +#' for the original version, +#' \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r} +#' +#' @examples +#' library(Matrix) +#' set.seed(42) +#' A <- Matrix(rbinom(100,10,0.05), nrow = 10) +#' rownames(A)=letters[1:10] +#' colnames(A)=LETTERS[1:10] +#' +#' colScaleM(A) +#' rowScaleM(A) +#' geomScaleM(A) +colScaleM <- function(A, na.rm=TRUE) { + scalefac = 1 / Matrix::colSums(A) + if(na.rm) scalefac[!is.finite(scalefac)]=0 + Matrix::colScale(A, scalefac) +} + +#' @export +#' @rdname colScaleM +#' @description \code{rowScale} normalises a matrix by the sum of each row +rowScaleM <- function(A, na.rm=TRUE) { + scalefac = 1 / Matrix::rowSums(A) + if(na.rm) scalefac[!is.finite(scalefac)]=0 + Matrix::rowScale(A, scalefac) +} + +#' @export +#' @rdname colScaleM +#' @description sqrt of both column and row normalisation +geomScaleM <- function(A, na.rm=TRUE) { + sqrt(colScaleM(A, na.rm=na.rm)*rowScaleM(A, na.rm=na.rm)) +} diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd new file mode 100644 index 0000000..3bc8e86 --- /dev/null +++ b/man/colScaleM.Rd @@ -0,0 +1,72 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/sparsemat.R +\name{colScaleM} +\alias{colScaleM} +\alias{rowScaleM} +\alias{geomScaleM} +\title{Efficient column and row scaling for sparse matrices} +\usage{ +colScaleM(A, na.rm = TRUE) + +rowScaleM(A, na.rm = TRUE) + +geomScaleM(A, na.rm = TRUE) +} +\arguments{ +\item{A}{a sparse (or dense) matrix} + +\item{na.rm}{when \code{T}, the default, converts any NA values (usually +resulting from divide by zero errors when a neuron has no partners) to 0} +} +\value{ +A scaled sparse matrix +} +\description{ +\code{colScale} normalises a matrix by the sum of each column + +\code{rowScale} normalises a matrix by the sum of each row + +sqrt of both column and row normalisation +} +\details{ +Conventionally in connectivity matrices, rows are upstream input + neurons and columns are downstream output neurons. \code{colScaleM} will + therefore normalise by the total input onto each downstream neuron while + \code{rowScaleM} will normalise by the total output from each upstream + neuron. + + These functions will work with dense inputs but are less efficient than + their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k + subset of VNC connectivity data took \itemize{ + + \item colScaleM sparse 3.5ms + + \item colScaleM dense 1460ms + + \item scale dense 365ms + + } + + In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are + the way to go for real connectivity matrices! For large matrices they will + be \emph{much} faster and for small ones, both methods will be fast anyway. + + Note also that these functions were originally called \code{colScale} etc + but then the \code{Matrix} package added functions of the same name. +} +\examples{ +library(Matrix) +set.seed(42) +A <- Matrix(rbinom(100,10,0.05), nrow = 10) +rownames(A)=letters[1:10] +colnames(A)=LETTERS[1:10] + +colScaleM(A) +rowScaleM(A) +geomScaleM(A) +} +\seealso{ +\code{Matrix::\link[Matrix]{colScale}} on which these are based and, + for the original version, + \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r} +} diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R new file mode 100644 index 0000000..9d20ff7 --- /dev/null +++ b/tests/testthat/test-sparsemat.R @@ -0,0 +1,34 @@ +test_that("multiplication works", { + # library(Matrix) + # set.seed(42) + # A <- Matrix(rbinom(100,10,0.05), nrow = 10) + # rownames(A)=letters[1:10] + # colnames(A)=LETTERS[1:10] + A=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L, +6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L, +8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L, +4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L, +38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b", +"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D", +"E", "F", "G", "H", "I", "J")), x = c(2, 2, 1, 1, 1, 1, 1, 1, +2, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1, +1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1), factors = list()) + colScaleM(A) + csA=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L, +6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L, +8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L, +4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L, +38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b", +"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D", +"E", "F", "G", "H", "I", "J")), x = c(0.222222222222222, 0.222222222222222, +0.111111111111111, 0.111111111111111, 0.111111111111111, 0.111111111111111, +0.111111111111111, 0.142857142857143, 0.285714285714286, 0.285714285714286, +0.285714285714286, 0.125, 0.375, 0.25, 0.125, 0.125, 0.166666666666667, +0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667, +0.166666666666667, 0.222222222222222, 0.222222222222222, 0.111111111111111, +0.111111111111111, 0.222222222222222, 0.111111111111111, 0.333333333333333, +0.333333333333333, 0.333333333333333, 0.142857142857143, 0.285714285714286, +0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, +1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list()) + expect_equal(colScaleM(A), csA) +}) From 0e071d9ea7318861ef3a2d9b0be1f1985e359db2 Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Wed, 29 Nov 2023 21:36:43 +0000 Subject: [PATCH 2/4] faster implementation of geomScaleM * avoid matrix multiplication and just use scale facs instead. * test vs old implementation --- R/sparsemat.R | 6 +++++- tests/testthat/test-sparsemat.R | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/R/sparsemat.R b/R/sparsemat.R index ceaf106..0489d16 100644 --- a/R/sparsemat.R +++ b/R/sparsemat.R @@ -66,5 +66,9 @@ rowScaleM <- function(A, na.rm=TRUE) { #' @rdname colScaleM #' @description sqrt of both column and row normalisation geomScaleM <- function(A, na.rm=TRUE) { - sqrt(colScaleM(A, na.rm=na.rm)*rowScaleM(A, na.rm=na.rm)) + sf1=sqrt(1/Matrix::rowSums(A)) + if(na.rm) sf1[!is.finite(sf1)]=0 + sf2=sqrt(1/Matrix::colSums(A)) + if(na.rm) sf2[!is.finite(sf2)]=0 + Matrix::dimScale(A, d1 = sf1, d2 = sf2) } diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R index 9d20ff7..d444a59 100644 --- a/tests/testthat/test-sparsemat.R +++ b/tests/testthat/test-sparsemat.R @@ -31,4 +31,7 @@ test_that("multiplication works", { 0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143, 1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list()) expect_equal(colScaleM(A), csA) + + expect_equal(geomScaleM(A), + sqrt(colScaleM(A)*rowScaleM(A))) }) From 35436e56d6443d80297765c7e3663fc0f34ea4f2 Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Thu, 25 Jul 2024 15:20:16 +0100 Subject: [PATCH 3/4] spelling --- DESCRIPTION | 2 +- R/sparsemat.R | 6 +++--- inst/WORDLIST | 2 ++ man/colScaleM.Rd | 6 +++--- 4 files changed, 9 insertions(+), 7 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 0a8dd48..5f97019 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,7 +13,7 @@ Description: Provides data source agnostic utility functions to support License: GPL (>= 3) Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Imports: bit64, checkmate, diff --git a/R/sparsemat.R b/R/sparsemat.R index 0489d16..38c399b 100644 --- a/R/sparsemat.R +++ b/R/sparsemat.R @@ -10,11 +10,11 @@ #' their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k #' subset of VNC connectivity data took \itemize{ #' -#' \item colScaleM sparse 3.5ms +#' \item \code{colScaleM} sparse 3.5ms #' -#' \item colScaleM dense 1460ms +#' \item \code{colScaleM} dense 1460ms #' -#' \item scale dense 365ms +#' \item \code{scale} dense 365ms #' #' } #' diff --git a/inst/WORDLIST b/inst/WORDLIST index aa15387..eebe043 100644 --- a/inst/WORDLIST +++ b/inst/WORDLIST @@ -5,8 +5,10 @@ Lifecycle Natverse ORCID PNs +VNC connectome connectomics +etc fafbseg flywire natverse diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd index 3bc8e86..d3d3a72 100644 --- a/man/colScaleM.Rd +++ b/man/colScaleM.Rd @@ -39,11 +39,11 @@ Conventionally in connectivity matrices, rows are upstream input their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k subset of VNC connectivity data took \itemize{ - \item colScaleM sparse 3.5ms + \item \code{colScaleM} sparse 3.5ms - \item colScaleM dense 1460ms + \item \code{colScaleM} dense 1460ms - \item scale dense 365ms + \item \code{scale} dense 365ms } From 52c083c7b54fc490a5827c65b2f8ddd15448ebce Mon Sep 17 00:00:00 2001 From: Gregory Jefferis Date: Sun, 17 Aug 2025 13:32:19 +0100 Subject: [PATCH 4/4] doc polishing --- R/sparsemat.R | 7 ++++++- man/colScaleM.Rd | 7 ++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/R/sparsemat.R b/R/sparsemat.R index 38c399b..fd41bf2 100644 --- a/R/sparsemat.R +++ b/R/sparsemat.R @@ -23,7 +23,12 @@ #' be \emph{much} faster and for small ones, both methods will be fast anyway. #' #' Note also that these functions were originally called \code{colScale} etc -#' but then the \code{Matrix} package added functions of the same name. +#' but then the \code{\link[Matrix]{Matrix}} package added functions of the +#' same name. The functions in this package differ in two respects from the +#' \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First +#' they calculate the required row or column sums to perform the scaling. +#' Second, they handle the inevitable zero sum rows or columns via the default +#' \code{na.rm} argument. #' #' @param A a sparse (or dense) matrix #' @param na.rm when \code{T}, the default, converts any NA values (usually diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd index d3d3a72..696c2f5 100644 --- a/man/colScaleM.Rd +++ b/man/colScaleM.Rd @@ -52,7 +52,12 @@ Conventionally in connectivity matrices, rows are upstream input be \emph{much} faster and for small ones, both methods will be fast anyway. Note also that these functions were originally called \code{colScale} etc - but then the \code{Matrix} package added functions of the same name. + but then the \code{\link[Matrix]{Matrix}} package added functions of the + same name. The functions in this package differ in two respects from the + \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First + they calculate the required row or column sums to perform the scaling. + Second, they handle the inevitable zero sum rows or columns via the default + \code{na.rm} argument. } \examples{ library(Matrix)