diff --git a/DESCRIPTION b/DESCRIPTION index 0c5c748..e523d5d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -31,7 +31,8 @@ Imports: usethis, methods, withr, - utils + utils, + tibble Suggests: malevnc (> 0.3.1), fancr (>= 0.5.0), diff --git a/R/cosine.R b/R/cosine.R index 7d0c1df..3acab23 100644 --- a/R/cosine.R +++ b/R/cosine.R @@ -296,29 +296,58 @@ cf_cosine_plot <- function(ids=NULL, ..., threshold=5, } -#' @description \code{multi_connection_table} fetches partner connectivity data -#' (the first step in \code{cf_cosine_plot} but then gives you the option e.g. -#' to select specific classes of partner neurons. See examples. +#'@description \code{multi_connection_table} fetches partner connectivity data +#' (the first step in \code{cf_cosine_plot}) but then gives you the option e.g. +#' to select specific classes of partner neurons, fix type names etc. See +#' examples. #' -#' @importFrom dplyr distinct all_of -#' @param check_missing Whether to report if any query neurons are dropped (due -#' to insufficient partner neurons) (default:\code{TRUE}). -#' @param min_datasets How many datasets a type must be in to be included in the -#' output. The default of \code{Inf} => all datasets must contain the cell -#' type. A negative number defines the number of datasets from which a type -#' can be missing. For example \code{-1} would mean that types would still be -#' included even if they are missing from one dataset. +#'@details At present the malecns dataset is the best integrated of all with +#' "foreign type" columns referencing the prior flywire female brain and MANC +#' male nerve cord datasets. These in turn have been the target of ongoing FANC +#' and BANC annotation efforts. Therefore right now the simplest way to ensure +#' that types can be matched across datasets is to use +#' \code{prefer.foreign=TRUE} when requesting multiple datasets. 
However when +#' using just the malecns, the standard typing for that dataset has some +#' improvements, so \code{prefer.foreign=FALSE} would be better. The default +#' setting of \code{prefer.foreign=NA} therefore chooses +#' \code{prefer.foreign=TRUE} when malecns and at least one other dataset are +#' being requested and \code{FALSE} otherwise. #' -#' @rdname cf_cosine_plot -#' @export -#' @return \code{multi_connection_table} returns a connectivity dataframe as -#' returned by \code{cf_partners} but with an additional column -#' \code{partners} which indicates (for each row) whether the partner neurons -#' are the input or output neurons. +#' Nevertheless, if you want really tight control of the type to type mapping +#' it is recommended to fetch with \code{prefer.foreign=F, min_datasets=1} and +#' then manually review and fix up any types that you know should match. If you +#' also set \code{keep.all=T} then you can access the foreign types columns +#' as part of your logic for doing this. +#' +#'@importFrom dplyr distinct all_of +#'@param check_missing Whether to report if any query neurons are dropped (due +#' to insufficient partner neurons) (default:\code{TRUE}). +#'@param min_datasets How many datasets a type must be in to be included in the +#' output. The default of \code{Inf} => all datasets must contain the cell +#' type. A negative number defines the number of datasets from which a type can +#' be missing. For example \code{-1} would mean that types would still be +#' included even if they are missing from one dataset. +#'@param prefer.foreign Whether to use foreign types for male CNS data. The +#' default value of \code{NA} prefers foreign types when multiple datasets +#' including malecns are requested. See details. +#'@param MoreArgs Passed to \code{\link{cf_partners}} For expert use only. +#'@param ... 
additional arguments passed to \code{\link{cf_partners}} +#'@inheritParams cf_partners +#' +#'@rdname cf_cosine_plot +#'@export +#'@return \code{multi_connection_table} returns a connectivity dataframe as +#' returned by \code{cf_partners} but with an additional column \code{partners} +#' which indicates (for each row) whether the partner neurons are the input or +#' output neurons. multi_connection_table <- function(ids, partners=c("inputs", "outputs"), threshold=1L, group='type', check_missing=TRUE, - min_datasets=Inf + min_datasets=Inf, + prefer.foreign=NA, + keep.all=FALSE, + MoreArgs=NULL, + ... ) { if(isTRUE(group)) group='type' @@ -327,7 +356,9 @@ multi_connection_table <- function(ids, partners=c("inputs", "outputs"), if(length(partners)>1) { l=sapply(partners, simplify = F, function(p) multi_connection_table(kk, partners=p, threshold = threshold, group=group, - check_missing=F, min_datasets = min_datasets)) + check_missing=F, min_datasets = min_datasets, + prefer.foreign=prefer.foreign, MoreArgs=MoreArgs, + keep.all=keep.all, ...)) l=dplyr::bind_rows(l) if(check_missing) { query_keys <- l %>% group_by(partners) %>% @@ -346,10 +377,14 @@ multi_connection_table <- function(ids, partners=c("inputs", "outputs"), } kdf=keys2df(kk) datasets=unique(kdf$dataset) - MoreArgs=list() - if(length(datasets)>1 && "malecns" %in% datasets) - MoreArgs=list(malecns=list(prefer.foreign=TRUE)) - x <- cf_partners(kk, threshold = threshold, partners = partners, MoreArgs = MoreArgs) + if(is.null(MoreArgs)){ + MoreArgs=list() + if(isTRUE(prefer.foreign) || + ((length(datasets)>1 && "malecns" %in% datasets) && is.na(prefer.foreign))) + MoreArgs=list(malecns=list(prefer.foreign=TRUE)) + } + x <- cf_partners(kk, threshold = threshold, partners = partners, + MoreArgs = MoreArgs, keep.all=keep.all, ...) 
if(is.character(group)) x <- match_types(x, group, partners=partners, min_datasets = min_datasets) # mark which column was used for the query diff --git a/R/meta.R b/R/meta.R index 3cd7b85..8eb0ca3 100644 --- a/R/meta.R +++ b/R/meta.R @@ -50,10 +50,6 @@ get_meta_fun <- function(dataset) { #' extension package.) #' @param MoreArgs A named list of arguments to be passed when fetching metadata #' for a given function. See details. -#' @param keep.all When fetching metadata from different datasets, whether to -#' keep all metadata columns rather than just those in common -#' (default=\code{FALSE}) -#' #' @inheritParams cf_partners #' #' @importFrom dplyr mutate rename rename_with select case_when any_of diff --git a/R/partners.R b/R/partners.R index e3dbb39..55baa43 100644 --- a/R/partners.R +++ b/R/partners.R @@ -18,7 +18,10 @@ #' @param partners Whether to return inputs or outputs #' @param bind.rows Whether to bind data.frames for each dataset together, #' keeping only the common columns (default \code{TRUE} for convenience but -#' note that some columns will be dropped). +#' note that some columns will be dropped unless \code{keep.all=TRUE}). +#' @param keep.all Whether to keep all columns when processing multiple datasets +#' rather than just those in common (default=\code{FALSE} only keeps shared +#' columns). #' @param MoreArgs Additional arguments in the form of a hierarchical list #' (expert use; see details and examples). 
#' @@ -42,7 +45,7 @@ #' MoreArgs = list(malecns=list(prefer.foreign=TRUE)) #' } cf_partners <- function(ids, threshold=1L, partners=c("inputs", "outputs"), - bind.rows=TRUE, MoreArgs=list()) { + bind.rows=TRUE, MoreArgs=list(), keep.all=FALSE) { partners=match.arg(partners) threshold <- checkmate::assert_integerish( threshold, lower=0L,len = 1, null.ok = F, all.missing = F) @@ -127,7 +130,7 @@ cf_partners <- function(ids, threshold=1L, partners=c("inputs", "outputs"), res[[n]]=tres } if(isTRUE(bind.rows)) { - res=bind_rows2(res) + res=bind_rows2(res, keep.all = keep.all) # record the datasets we tried to find attr(res, 'datasets')=names(ids) res diff --git a/R/utils.R b/R/utils.R index 4f0439c..97ed3eb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -11,11 +11,44 @@ bind_rows2 <- function(l, keep.all=FALSE) { l=lapply(l, "[", commoncols) l <- do.call(function(...) rbind(..., make.row.names=FALSE), l) } else { - l <- dplyr::bind_rows(l) + l2=fix_mixed_col_types(l) + l <- dplyr::bind_rows(l2) } l } +# Convert columns that are character in some data frames of l but another mode +# in others to character, so that dplyr::bind_rows() can combine them. +fix_mixed_col_types <- function(l) { + dd=dplyr::bind_rows( + lapply(l, function(x) tibble::tibble( + name=names(x), + mode=sapply(x, mode))), + .id = 'dfname') + tofix <- dd %>% + group_by(name) %>% + summarise(nmodes=n_distinct(mode), + ndfs=n_distinct(dfname), + some_character=any(mode=='character')) %>% + filter(some_character & nmodes>1) + if(nrow(tofix)<1) + return(l) + + lapply(l, function(d) { + for (nm in intersect(tofix$name, names(d))) { + x <- d[[nm]] + if (!is.character(x)) { + # use id2char rather than as.character to ensure eg 100000 processed ok + ix <- try(coconat::id2char(x), silent = TRUE) + # but fall back if id2char can't handle it + if(inherits(ix, 'try-error')) ix <- as.character(x) + d[[nm]] <- ix + } + } + d + }) +} + cf_connections <- function() { dslist=list() npds=c("hemibrain", "manc", "malecns", 'opticlobe') diff --git a/_pkgdown.yml b/_pkgdown.yml index ab0e2a9..6f7a917 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -8,3 +8,4 @@ articles: - 
getting-started - TuTu - AOTU063 + - extending-coconatfly diff --git a/man/cf_cosine_plot.Rd b/man/cf_cosine_plot.Rd index 4010580..d386b90 100644 --- a/man/cf_cosine_plot.Rd +++ b/man/cf_cosine_plot.Rd @@ -29,7 +29,11 @@ multi_connection_table( threshold = 1L, group = "type", check_missing = TRUE, - min_datasets = Inf + min_datasets = Inf, + prefer.foreign = NA, + keep.all = FALSE, + MoreArgs = NULL, + ... ) } \arguments{ @@ -37,7 +41,7 @@ multi_connection_table( wrapped by \code{\link{cf_ids}} \emph{or} a dataframe compatible with the \code{\link{keys}} function.} -\item{...}{Additional arguments passed to \code{\link{heatmap}}} +\item{...}{additional arguments passed to \code{\link{cf_partners}}} \item{threshold}{return only edges with at least this many matches. 0 is an option since neuprint sometimes returns 0 weight edges.} @@ -77,8 +81,8 @@ for details.} \item{min_datasets}{How many datasets a type must be in to be included in the output. The default of \code{Inf} => all datasets must contain the cell -type. A negative number defines the number of datasets from which a type -can be missing. For example \code{-1} would mean that types would still be +type. A negative number defines the number of datasets from which a type can +be missing. For example \code{-1} would mean that types would still be included even if they are missing from one dataset.} \item{nas}{What to do with entries that have NAs. Default is to set them to 0 @@ -88,6 +92,16 @@ similarity.} \item{check_missing}{Whether to report if any query neurons are dropped (due to insufficient partner neurons) (default:\code{TRUE}).} + +\item{prefer.foreign}{Whether to use foreign types for male CNS data. The +default value of \code{NA} prefers foreign types when multiple datasets +including malecns are requested. 
See details.} + +\item{keep.all}{Whether to keep all columns when processing multiple datasets +rather than just those in common (default=\code{FALSE} only keeps shared +columns).} + +\item{MoreArgs}{Passed to \code{\link{cf_partners}} For expert use only.} } \value{ The result of \code{\link{heatmap}} invisibly including the row and @@ -96,9 +110,9 @@ The result of \code{\link{heatmap}} invisibly including the row and matrix. \code{multi_connection_table} returns a connectivity dataframe as - returned by \code{cf_partners} but with an additional column - \code{partners} which indicates (for each row) whether the partner neurons - are the input or output neurons. + returned by \code{cf_partners} but with an additional column \code{partners} + which indicates (for each row) whether the partner neurons are the input or + output neurons. } \description{ \code{cf_cosine_plot} is the workhorse function for within and @@ -107,8 +121,9 @@ output of \code{multi_connection_table} if you need more control. See examples. \code{multi_connection_table} fetches partner connectivity data - (the first step in \code{cf_cosine_plot} but then gives you the option e.g. - to select specific classes of partner neurons. See examples. + (the first step in \code{cf_cosine_plot}) but then gives you the option e.g. + to select specific classes of partner neurons, fix type names etc. See + examples. } \details{ \code{group=FALSE} only makes sense for single dataset clustering - @@ -141,6 +156,24 @@ examples. argument then you will get an error. This is because \code{cf_cosine_plot} has no way of knowing which label corresponds to which neuron, almost certainly resulting in incorrect row labels on your dendrogram. + +At present the malecns dataset is the best integrated of all with + "foreign type" columns referencing the prior flywire female brain and MANC + male nerve cord datasets. These in turn have been the target of ongoing FANC + and BANC annotation efforts. 
Therefore right now the simplest way to ensure + that types can be matched across datasets is to use + \code{prefer.foreign=TRUE} when requesting multiple datasets. However when + using just the malecns, the standard typing for that dataset has some + improvements, so \code{prefer.foreign=FALSE} would be better. The default + setting of \code{prefer.foreign=NA} therefore chooses + \code{prefer.foreign=TRUE} when malecns and at least one other dataset are + being requested and \code{FALSE} otherwise. + + Nevertheless, if you want really tight control of the type to type mapping + it is recommended to fetch with \code{prefer.foreign=F, min_datasets=1} and + then manually review and fix up any types that you know should match. If you + also set \code{keep.all=T} then you can access the foreign types columns + as part of your logic for doing this. } \examples{ \donttest{ diff --git a/man/cf_meta.Rd b/man/cf_meta.Rd index cb6eed4..b4ef47c 100644 --- a/man/cf_meta.Rd +++ b/man/cf_meta.Rd @@ -19,15 +19,15 @@ other input that can be processed by the \code{\link{keys}} function \item{bind.rows}{Whether to bind data.frames for each dataset together, keeping only the common columns (default \code{TRUE} for convenience but -note that some columns will be dropped).} +note that some columns will be dropped unless \code{keep.all=TRUE}).} \item{integer64}{Whether ids should be character vectors (default) or 64 bit ints (more compact but a little fragile as they rely on the \code{bit64} extension package.)} -\item{keep.all}{When fetching metadata from different datasets, whether to -keep all metadata columns rather than just those in common -(default=\code{FALSE})} +\item{keep.all}{Whether to keep all columns when processing multiple datasets +rather than just those in common (default=\code{FALSE} only keeps shared +columns).} \item{MoreArgs}{A named list of arguments to be passed when fetching metadata for a given function. 
See details.} diff --git a/man/cf_partners.Rd b/man/cf_partners.Rd index 5661bef..f3d250c 100644 --- a/man/cf_partners.Rd +++ b/man/cf_partners.Rd @@ -9,7 +9,8 @@ cf_partners( threshold = 1L, partners = c("inputs", "outputs"), bind.rows = TRUE, - MoreArgs = list() + MoreArgs = list(), + keep.all = FALSE ) } \arguments{ @@ -24,10 +25,14 @@ option since neuprint sometimes returns 0 weight edges.} \item{bind.rows}{Whether to bind data.frames for each dataset together, keeping only the common columns (default \code{TRUE} for convenience but -note that some columns will be dropped).} +note that some columns will be dropped unless \code{keep.all=TRUE}).} \item{MoreArgs}{Additional arguments in the form of a hierarchical list (expert use; see details and examples).} + +\item{keep.all}{Whether to keep all columns when processing multiple datasets +rather than just those in common (default=\code{FALSE} only keeps shared +columns).} } \value{ A data.frame or a named list (when \code{bind.rows=FALSE}) diff --git a/vignettes/AOTU063.Rmd b/vignettes/AOTU063.Rmd index c008766..d468590 100644 --- a/vignettes/AOTU063.Rmd +++ b/vignettes/AOTU063.Rmd @@ -33,17 +33,17 @@ library(dplyr) For this analysis we will use the version 630 connectivity / annotation data released in June 2023. We will set an option use the lower level fafbseg package to ensure this. +You may need to download the relevant data dumps if you have not done so previously. ```{r} -fafbseg::flywire_connectome_data_version(set = 630) +fafbseg::download_flywire_release_data(version = 630) ``` -You may need to download the relevant data dumps if you have not done so previously. 
- -```{r, eval=FALSE} -fafbseg::download_flywire_release_data(version = 630) +```{r} +fafbseg::flywire_connectome_data_version(set = 630) ``` + ```{r} aotu63=cf_meta(cf_ids(query = '/type:AOTU063.*', datasets = c("flywire","hemibrain"))) aotu63 diff --git a/vignettes/extending-coconatfly.Rmd b/vignettes/extending-coconatfly.Rmd index 089b2f6..8f6c74b 100644 --- a/vignettes/extending-coconatfly.Rmd +++ b/vignettes/extending-coconatfly.Rmd @@ -1,8 +1,8 @@ --- -title: "Extending coconatfly with external data sources" +title: "4. Extending coconatfly with external data sources" output: rmarkdown::html_vignette vignette: > - %\VignetteIndexEntry{Extending coconatfly with external data sources} + %\VignetteIndexEntry{4. Extending coconatfly with external data sources} %\VignetteEngine{knitr::rmarkdown} %\VignetteEncoding{UTF-8} ---