diff --git a/DESCRIPTION b/DESCRIPTION index 4918301..5f97019 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -13,12 +13,12 @@ Description: Provides data source agnostic utility functions to support License: GPL (>= 3) Encoding: UTF-8 LazyData: true -RoxygenNote: 7.2.3 +RoxygenNote: 7.3.2 Imports: bit64, checkmate, glue, - Matrix, + Matrix (>= 1.5-3), grDevices, stats Suggests: diff --git a/NAMESPACE b/NAMESPACE index 9d74675..988e48c 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,12 +1,15 @@ # Generated by roxygen2: do not edit by hand export(add_cluster_info) +export(colScaleM) export(cosine_sim) export(dataset_names) +export(geomScaleM) export(id2char) export(partner_summary2adjacency_matrix) export(prepare_cosine_matrix) export(register_dataset) +export(rowScaleM) importFrom(grDevices,hcl.colors) importFrom(stats,as.dist) importFrom(stats,hclust) diff --git a/R/sparsemat.R b/R/sparsemat.R new file mode 100644 index 0000000..fd41bf2 --- /dev/null +++ b/R/sparsemat.R @@ -0,0 +1,79 @@ +#' Efficient column and row scaling for sparse matrices +#' +#' @details Conventionally in connectivity matrices, rows are upstream input +#' neurons and columns are downstream output neurons. \code{colScaleM} will +#' therefore normalise by the total input onto each downstream neuron while +#' \code{rowScaleM} will normalise by the total output from each upstream +#' neuron. +#' +#' These functions will work with dense inputs but are less efficient than +#' their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k +#' subset of VNC connectivity data took \itemize{ +#' +#' \item \code{colScaleM} sparse 3.5ms +#' +#' \item \code{colScaleM} dense 1460ms +#' +#' \item \code{scale} dense 365ms +#' +#' } +#' +#' In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are +#' the way to go for real connectivity matrices! For large matrices they will +#' be \emph{much} faster and for small ones, both methods will be fast anyway. 
+#'
+#' Note also that these functions were originally called \code{colScale} etc
+#' but then the \code{\link[Matrix]{Matrix}} package added functions of the
+#' same name. The functions in this package differ in two respects from the
+#' \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First
+#' they calculate the required row or column sums to perform the scaling.
+#' Second, they handle the inevitable zero sum rows or columns via the default
+#' \code{na.rm} argument.
+#'
+#' @param A a sparse (or dense) matrix
+#' @param na.rm when \code{TRUE}, the default, converts any NA values (usually
+#' resulting from divide by zero errors when a neuron has no partners) to 0
+#' @description \code{colScaleM} normalises a matrix by the sum of each column
+#'
+#' @return A scaled sparse matrix
+#' @export
+#'
+#' @seealso \code{Matrix::\link[Matrix]{colScale}} on which these are based and,
+#' for the original version,
+#' \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r}
+#'
+#' @examples
+#' library(Matrix)
+#' set.seed(42)
+#' A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+#' rownames(A)=letters[1:10]
+#' colnames(A)=LETTERS[1:10]
+#'
+#' colScaleM(A)
+#' rowScaleM(A)
+#' geomScaleM(A)
+colScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::colSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::colScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description \code{rowScaleM} normalises a matrix by the sum of each row
+rowScaleM <- function(A, na.rm=TRUE) {
+  scalefac = 1 / Matrix::rowSums(A)
+  if(na.rm) scalefac[!is.finite(scalefac)]=0
+  Matrix::rowScale(A, scalefac)
+}
+
+#' @export
+#' @rdname colScaleM
+#' @description \code{geomScaleM} takes the sqrt of both column and row normalisation
+geomScaleM <- function(A, na.rm=TRUE) {
+  sf1=sqrt(1/Matrix::rowSums(A))
+  if(na.rm) sf1[!is.finite(sf1)]=0
+  sf2=sqrt(1/Matrix::colSums(A))
+  if(na.rm) sf2[!is.finite(sf2)]=0
+  Matrix::dimScale(A, d1 = sf1, d2 = 
sf2)
+}
diff --git a/inst/WORDLIST b/inst/WORDLIST
index aa15387..eebe043 100644
--- a/inst/WORDLIST
+++ b/inst/WORDLIST
@@ -5,8 +5,10 @@ Lifecycle
 Natverse
 ORCID
 PNs
+VNC
 connectome
 connectomics
+etc
 fafbseg
 flywire
 natverse
diff --git a/man/colScaleM.Rd b/man/colScaleM.Rd
new file mode 100644
index 0000000..696c2f5
--- /dev/null
+++ b/man/colScaleM.Rd
@@ -0,0 +1,77 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/sparsemat.R
+\name{colScaleM}
+\alias{colScaleM}
+\alias{rowScaleM}
+\alias{geomScaleM}
+\title{Efficient column and row scaling for sparse matrices}
+\usage{
+colScaleM(A, na.rm = TRUE)
+
+rowScaleM(A, na.rm = TRUE)
+
+geomScaleM(A, na.rm = TRUE)
+}
+\arguments{
+\item{A}{a sparse (or dense) matrix}
+
+\item{na.rm}{when \code{TRUE}, the default, converts any NA values (usually
+resulting from divide by zero errors when a neuron has no partners) to 0}
+}
+\value{
+A scaled sparse matrix
+}
+\description{
+\code{colScaleM} normalises a matrix by the sum of each column
+
+\code{rowScaleM} normalises a matrix by the sum of each row
+
+\code{geomScaleM} takes the sqrt of both column and row normalisation
+}
+\details{
+Conventionally in connectivity matrices, rows are upstream input
+ neurons and columns are downstream output neurons. \code{colScaleM} will
+ therefore normalise by the total input onto each downstream neuron while
+ \code{rowScaleM} will normalise by the total output from each upstream
+ neuron.
+
+ These functions will work with dense inputs but are less efficient than
+ their base R cousins (e.g. \code{scale}). As an example, normalising a 4x4k
+ subset of VNC connectivity data took \itemize{
+
+ \item \code{colScaleM} sparse 3.5ms
+
+ \item \code{colScaleM} dense 1460ms
+
+ \item \code{scale} dense 365ms
+
+ }
+
+ In conclusion sparse matrices with \code{colScaleM}/\code{rowScaleM} are
+ the way to go for real connectivity matrices! For large matrices they will
+ be \emph{much} faster and for small ones, both methods will be fast anyway. 
+
+ Note also that these functions were originally called \code{colScale} etc
+ but then the \code{\link[Matrix]{Matrix}} package added functions of the
+ same name. The functions in this package differ in two respects from the
+ \code{Matrix::\link[Matrix:colScale]{colScale,rowScale}} functions. First
+ they calculate the required row or column sums to perform the scaling.
+ Second, they handle the inevitable zero sum rows or columns via the default
+ \code{na.rm} argument.
+}
+\examples{
+library(Matrix)
+set.seed(42)
+A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+rownames(A)=letters[1:10]
+colnames(A)=LETTERS[1:10]
+
+colScaleM(A)
+rowScaleM(A)
+geomScaleM(A)
+}
+\seealso{
+\code{Matrix::\link[Matrix]{colScale}} on which these are based and,
+ for the original version,
+ \url{https://stackoverflow.com/questions/39284774/column-rescaling-for-a-very-large-sparse-matrix-in-r}
+}
diff --git a/tests/testthat/test-sparsemat.R b/tests/testthat/test-sparsemat.R
new file mode 100644
index 0000000..d444a59
--- /dev/null
+++ b/tests/testthat/test-sparsemat.R
@@ -0,0 +1,37 @@
+test_that("colScaleM, rowScaleM and geomScaleM scale correctly", {
+  # library(Matrix)
+  # set.seed(42)
+  # A <- Matrix(rbinom(100,10,0.05), nrow = 10)
+  # rownames(A)=letters[1:10]
+  # colnames(A)=LETTERS[1:10]
+  A=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L,
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(2, 2, 1, 1, 1, 1, 1, 1,
+2, 2, 2, 1, 3, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 2, 1, 1,
+1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1), factors = list())
+  colScaleM(A)
+  csA=new("dgCMatrix", i = c(0L, 1L, 3L, 4L, 6L, 8L, 9L, 1L, 2L, 5L,
+6L, 0L, 2L, 3L, 7L, 9L, 0L, 1L, 3L, 5L, 8L, 9L, 3L, 5L, 6L, 7L, 
+8L, 9L, 3L, 5L, 6L, 0L, 1L, 2L, 4L, 7L, 8L, 5L, 3L, 4L, 0L, 3L,
+4L, 5L, 8L, 9L), p = c(0L, 7L, 11L, 16L, 22L, 28L, 31L, 37L,
+38L, 40L, 46L), Dim = c(10L, 10L), Dimnames = list(c("a", "b",
+"c", "d", "e", "f", "g", "h", "i", "j"), c("A", "B", "C", "D",
+"E", "F", "G", "H", "I", "J")), x = c(0.222222222222222, 0.222222222222222,
+0.111111111111111, 0.111111111111111, 0.111111111111111, 0.111111111111111,
+0.111111111111111, 0.142857142857143, 0.285714285714286, 0.285714285714286,
+0.285714285714286, 0.125, 0.375, 0.25, 0.125, 0.125, 0.166666666666667,
+0.166666666666667, 0.166666666666667, 0.166666666666667, 0.166666666666667,
+0.166666666666667, 0.222222222222222, 0.222222222222222, 0.111111111111111,
+0.111111111111111, 0.222222222222222, 0.111111111111111, 0.333333333333333,
+0.333333333333333, 0.333333333333333, 0.142857142857143, 0.285714285714286,
+0.142857142857143, 0.142857142857143, 0.142857142857143, 0.142857142857143,
+1, 0.5, 0.5, 0.125, 0.25, 0.25, 0.125, 0.125, 0.125), factors = list())
+  expect_equal(colScaleM(A), csA)
+
+  expect_equal(geomScaleM(A),
+               sqrt(colScaleM(A)*rowScaleM(A)))
+})