diff --git a/DESCRIPTION b/DESCRIPTION index 8f4a851..6e14690 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -39,7 +39,8 @@ Imports: readr, ggh4x, data.table, - rlang + rlang, + duckdb Depends: R (>= 3.4.1), SummarizedExperiment, diff --git a/NAMESPACE b/NAMESPACE index ac7b6ec..3d95b18 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -186,6 +186,7 @@ import(methods) importClassesFrom(RSQLite,SQLiteConnection) importClassesFrom(S4Vectors,DFrame) importClassesFrom(S4Vectors,DataFrame) +importClassesFrom(duckdb,duckdb_connection) importFrom(S4Vectors,setValidity2) importFrom(magrittr,"%$%") importFrom(magrittr,"%>%") diff --git a/R/aggDb.R b/R/aggDb.R index d3f6cb7..5eabd9f 100644 --- a/R/aggDb.R +++ b/R/aggDb.R @@ -14,7 +14,7 @@ aggdb <- function(path) { } tryCatch( { - con <- DBI::dbConnect(RSQLite::SQLite(), path) + con <- DBI::dbConnect(DBI::dbDriver("SQLite"), path) }, error = function(e) { stop(sprintf("Invalid aggdb path '%s'", path), call. = FALSE) diff --git a/R/allClasses.R b/R/allClasses.R index 4fa1deb..d5a9b68 100644 --- a/R/allClasses.R +++ b/R/allClasses.R @@ -92,7 +92,7 @@ setClass("genoMatrix", contains = "SummarizedExperiment") #' @description #' Compressed SQLite (".gdb") representation of genotype data and associated variant annotation and sample info tables. #' These allow for rapid and memory-efficient loading of sample genotype data and associated metadata within R. -#' The slots of the gdb class are inherited entirely from the [`RSQLite::SQLiteConnection-class`]. +#' The slots of the gdb class are inherited entirely from the duckdb::duckdb() connection. #' A host of RVAT methods described here allow for convenient querying and manipulation of a gdb, for complex queries #' users can also directly perform SQL queries on the gdb as exemplified in the examples and tutorials. #' @@ -154,9 +154,9 @@ NULL #' #' @rdname gdb #' @import DBI -#' @importClassesFrom RSQLite SQLiteConnection +#' @importClassesFrom duckdb duckdb_connection #' @export -setClass("gdb", contains = "SQLiteConnection") +setClass("gdb", contains = "duckdb_connection") #' Class to manage multiple varSets @@ -706,6 +706,7 @@ NULL #' @rdname aggdb #' @usage NULL +#' @importClassesFrom RSQLite SQLiteConnection #' @export setClass("aggdb", contains = "SQLiteConnection") diff --git a/R/allGenerics.R b/R/allGenerics.R index c7c804c..95c97cd 100644 --- a/R/allGenerics.R +++ b/R/allGenerics.R @@ -417,7 +417,7 @@ setGeneric("insertVarRecord", function(object, record) { standardGeneric("insertVarRecord") }) -setGeneric("insertDosageRecord", function(object, record) { +setGeneric("insertDosageRecord", function(object, record, var_id) { standardGeneric("insertDosageRecord") }) diff --git a/R/gdb-anno-cohort.R b/R/gdb-anno-cohort.R index 6c70660..4992f56 100644 --- a/R/gdb-anno-cohort.R +++ b/R/gdb-anno-cohort.R @@ -54,10 +54,14 @@ setMethod( # build where statement if (!is.null(VAR_id)) { - VAR_id <- sprintf( - "VAR_id in (%s)", - paste(as.integer(VAR_id), collapse = ",") - ) + if (length(VAR_id) > 0L) { + VAR_id <- sprintf( + "VAR_id in (%s)", + paste(as.integer(VAR_id), collapse = ",") + ) + } else { + VAR_id <- "1=0" + } if (length(where) > 0L) { where <- sprintf("(%s) AND (%s)", VAR_id, where) } else { @@ -221,12 +225,14 @@ setMethod( if (verbose) { message(sprintf("Loading table '%s' from '%s'", name, value)) } - DBI::dbWriteTable( - con = gdb, - name = name, - value = value, - sep = sep, - overwrite = TRUE + DBI::dbExecute( + gdb, + sprintf( + "CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s', delim='%s')", + name, + value, + sep + ) ) source_info <- value } else { @@ -605,8 +611,8 @@ setMethod( if (meta_anno_exists) { DBI::dbExecute( object, - "DELETE FROM anno WHERE name = :table_to_remove", - params = list(table_to_remove = name) + "DELETE FROM anno WHERE name = ?", + params = list(name) ) } @@ -614,8 +620,8 @@ setMethod( if (meta_cohort_exists) { DBI::dbExecute( object, - "DELETE FROM cohort WHERE name = :table_to_remove", - params = list(table_to_remove = name) + "DELETE FROM cohort WHERE name = ?", + params = list(name) ) } diff --git a/R/gdb-buildGdb.R b/R/gdb-buildGdb.R index 29d668d..7785850 100644 --- a/R/gdb-buildGdb.R +++ b/R/gdb-buildGdb.R @@ -25,7 +25,7 @@ buildGdb <- function( vcf, output, - skipIndexes = FALSE, + skipIndexes = FALSE, ## DuckDB does indexing internally, so no need to do it manually as well ##TODO: check is this is true skipVarRanges = FALSE, overWrite = FALSE, genomeBuild = NULL, @@ -190,7 +190,7 @@ setMethod( carrierN <- length(carrierIndex) for (ai in seq_along(alleles)) { # write variant info - insertVarRecord( + var_id <- insertVarRecord( object, record = c(records[i, 1:4], alleles[ai], records[i, 6:9]) ) @@ -203,14 +203,15 @@ setMethod( collapse = "/" ) } - insertDosageRecord(object, record = gtia) + insertDosageRecord(object, record = gtia, var_id = var_id) } # bi-allelic site } else { - insertVarRecord(object, record = records[i, 1:9]) + var_id <- insertVarRecord(object, record = records[i, 1:9]) insertDosageRecord( object, - record = substr(records[i, -(1:9), drop = FALSE], 1, 3) + record = substr(records[i, -(1:9), drop = FALSE], 1, 3), + var_id = var_id ) } } @@ -233,32 +234,34 @@ setMethod( "insertVarRecord", signature = "gdb", definition = function(object, record) { - DBI::dbExecute( + var_id <- DBI::dbGetQuery( object, - paste0( - "insert into var(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) ", - "values (:chrom, :pos, :id, :ref, :alt, :qual, :flt, :info, :form)" - ), + "INSERT INTO var(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) + VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?) + RETURNING VAR_id;", params = list( - chrom = record[1], - pos = record[2], - id = record[3], - ref = record[4], - alt = record[5], - qual = record[6], - flt = record[7], - info = record[8], - form = record[9] + record[1], # CHROM + as.integer(record[2]), # POS + record[3], # ID + record[4], # REF + record[5], # ALT + record[6], # QUAL + record[7], # FILTER + record[8], # INFO + record[9] # FORMAT ) - ) + )$VAR_id + + return(var_id) } ) setMethod( "insertDosageRecord", signature = "gdb", - definition = function(object, record) { - record[record == "0/0"] <- "0" + definition = function(object, record, var_id) { + ## record contains the rows of the VCF file, where each row corresponds to a variant, the first 9 columns correspond to headers (CHROM, POS, etc.) and from column 10 the samples. var_id is added so that it can be passed from teh insertVarRecord function + record[record == "0/0"] <- "0" ## all different options, if a allele is unkown (due to for example sequence quality), it is set to 0 (the reference allele) record[record == "./0"] <- "0" record[record == "0/1"] <- "1" record[record == "1/0"] <- "1" @@ -272,8 +275,9 @@ setMethod( record[record == "1|1"] <- "2" record[record == ".|."] <- "N" record[record == ".:0"] <- "N" - obs <- sort(unique(c(record))) + obs <- sort(unique(c(record))) ## stores all unique genotypes if (sum(!obs %in% c("0", "1", "2", "N")) > 0L) { + ## if the obs contains other characters than 0, 1, 2, N it leads to an error stop( sprintf( paste0( @@ -285,12 +289,15 @@ setMethod( call. = FALSE ) } - + ## compress record <- I(list(memCompress(paste(record, collapse = ""), type = "gzip"))) + DBI::dbExecute( + ## opens the connection to the gdb object, - "insert into dosage(GT) values (:record)", - params = list(record = record) + "INSERT INTO dosage(VAR_id, GT) + VALUES (?, ?)", ## inserts into the var_id, since this is not done automatically with duckDB + params = list(var_id, record) ) } ) @@ -340,10 +347,10 @@ setMethod( # insert GRanges into gdb as blobs per chromosome DBI::dbExecute( object, - statement = "insert into var_ranges(CHROM,ranges) values (:CHROM, :ranges)", + statement = "insert into var_ranges(CHROM,ranges) values (?, ?)", params = list( - CHROM = chrom, - ranges = list(serialize(gr, connection = NULL)) + chrom, + list(serialize(gr, connection = NULL)) ) ) } @@ -373,23 +380,70 @@ setMethod( invisible(NULL) } +.buildgdb_create_schema <- function(gdb, verbose = TRUE) { + ## drop tables if they already exist. This is needed in DuckDB. + .drop_duckdb_objects(gdb, verbose = verbose) -.buildgdb_create_schema <- function(gdb) { - DBI::dbExecute( - gdb, - paste0( - "create table var (VAR_id integer primary key, CHROM text, POS int, ", - "ID text, REF text, ALT text, QUAL text, FILTER text, INFO text, FORMAT text);" - ) - ) - DBI::dbExecute(gdb, "create table SM (IID text, sex int)") + # create sequence + DBI::dbExecute(gdb, "CREATE SEQUENCE var_seq START 1;") + + # create table with default VAR_id from sequence DBI::dbExecute( gdb, - "create table dosage (VAR_id integer primary key, GT BLOB);" + " + CREATE TABLE var ( + VAR_id INTEGER PRIMARY KEY DEFAULT nextval('var_seq'), + CHROM TEXT, + POS INT, + ID TEXT, + REF TEXT, + ALT TEXT, + QUAL TEXT, + FILTER TEXT, + INFO TEXT, + FORMAT TEXT + );" ) - DBI::dbExecute(gdb, "create table anno (name text,value text,date text)") - DBI::dbExecute(gdb, "create table cohort (name text,value text,date text)") - DBI::dbExecute(gdb, "create table meta (name text,value text)") + + DBI::dbExecute(gdb, "CREATE TABLE SM (IID TEXT, sex INT);") + DBI::dbExecute(gdb, "CREATE TABLE dosage (VAR_id INTEGER, GT BLOB);") + DBI::dbExecute(gdb, "CREATE TABLE anno (name TEXT,value TEXT,date TEXT);") + DBI::dbExecute(gdb, "CREATE TABLE cohort (name TEXT,value TEXT,date TEXT);") + DBI::dbExecute(gdb, "CREATE TABLE meta (name TEXT,value TEXT);") invisible(NULL) } + + +.drop_duckdb_objects <- function(gdb, verbose = TRUE) { + # drop tables if they exist + tables <- DBI::dbListTables(gdb) + for (tbl in c( + "var", + "SM", + "dosage", + "anno", + "cohort", + "meta", + "var_ranges" + )) { + if (tbl %in% tables) { + if (verbose) { + message(sprintf("Dropping existing table '%s'", tbl)) + } + DBI::dbExecute(gdb, sprintf("DROP TABLE IF EXISTS %s;", tbl)) + } + } + + # drop sequence + sequences <- DBI::dbGetQuery( + gdb, + "SELECT sequence_name FROM duckdb_sequences();" + ) + if ("var_seq" %in% sequences$sequence_name) { + if (verbose) { + message("Dropping existing sequence 'var_seq'") + } + DBI::dbExecute(gdb, "DROP SEQUENCE var_seq;") + } +} diff --git a/R/gdb-getGT.R b/R/gdb-getGT.R index 7f27da4..8755d41 100644 --- a/R/gdb-getGT.R +++ b/R/gdb-getGT.R @@ -342,7 +342,7 @@ setMethod( "select VAR_id, 'diploid' as ploidy, CHROM, POS, REF ", "from var where VAR_id in (%s);" ), - paste(as.integer(VAR_id), collapse = ",") + paste(as.integer(VAR_id), collapse = ",") ##TODO, will this work if VAR_id is very large? or better to split into chunks? ) ) @@ -404,7 +404,7 @@ setMethod( "select VAR_id, GT from dosage where VAR_id in (%s);", paste(as.integer(VAR_id), collapse = ",") ) - GT <- RSQLite::dbGetQuery(object, query) + GT <- DBI::dbGetQuery(object, query) # check if all variants are found success <- sum(VAR_id %in% GT[, 1]) diff --git a/R/gdb-shared-helpers.R b/R/gdb-shared-helpers.R index bdb6756..5c74eed 100644 --- a/R/gdb-shared-helpers.R +++ b/R/gdb-shared-helpers.R @@ -61,25 +61,33 @@ # add rvat version to meta table DBI::dbExecute( gdb, - "INSERT INTO meta VALUES (:name, :value)", + "INSERT INTO meta VALUES (?, ?)", params = list( - name = "rvatVersion", - value = as.character(packageVersion("rvat")) + "rvatVersion", + as.character(packageVersion("rvat")) ) ) # add random identifier DBI::dbExecute( gdb, - "INSERT INTO meta VALUES (:name, :value)", + "INSERT INTO meta VALUES (?, ?)", params = list( - name = "id", - value = paste(sample(c(letters, 0:9), 28L, replace = TRUE), collapse = "") + "id", + paste(sample(c(letters, 0:9), 28L, replace = TRUE), collapse = "") ) ) # add genome build - if (!is.null(genomeBuild) && !genomeBuild %in% names(nonPAR)) { + if (!is.null(genomeBuild)) { + ## added this to insert into genomeBuild into the meta table + DBI::dbExecute( + gdb, + "INSERT INTO meta VALUES (?, ?)", + params = list("genomeBuild", genomeBuild) + ) + } + if (!genomeBuild %in% names(nonPAR)) { warning( sprintf( paste0( @@ -95,26 +103,15 @@ ) } - DBI::dbExecute( - gdb, - "INSERT INTO meta VALUES (:name, :value)", - params = list( - name = "genomeBuild", - value = if (is.null(genomeBuild)) NA_character_ else genomeBuild - ) - ) - # add creation date DBI::dbExecute( gdb, - "INSERT INTO meta VALUES (:name, :value)", + "INSERT INTO meta VALUES (?, ?)", params = list( - name = "creationDate", - value = as.character(round(Sys.time(), units = "secs")) + "creationDate", + as.character(round(Sys.time(), units = "secs")) ) ) - - invisible(NULL) } @@ -129,8 +126,11 @@ as.character(round(Sys.time(), units = "secs")) )) } - DBI::dbExecute(gdb, "create index var_idx on var (VAR_id)") - DBI::dbExecute(gdb, "create index var_idx2 on var (CHROM,POS,REF,ALT)") + DBI::dbExecute(gdb, "create index IF NOT EXISTS var_idx on var (VAR_id)") + DBI::dbExecute( + gdb, + "create index IF NOT EXISTS var_idx2 on var (CHROM,POS,REF,ALT)" + ) # SM indexes if (verbose) { @@ -139,7 +139,7 @@ as.character(round(Sys.time(), units = "secs")) )) } - DBI::dbExecute(gdb, "create index SM_idx on SM (IID)") + DBI::dbExecute(gdb, "create index IF NOT EXISTS SM_idx on SM (IID)") # dosage indexes if (verbose) { @@ -148,7 +148,12 @@ as.character(round(Sys.time(), units = "secs")) )) } - DBI::dbExecute(gdb, "create index dosage_idx on dosage (VAR_id)") + DBI::dbExecute( + gdb, + "create index IF NOT EXISTS dosage_idx on dosage (VAR_id)" + ) + + DBI::dbExecute(gdb, "ANALYZE") ##TODO: SEE IF THIS IS INTERESTING TO USE # return nothing invisible(NULL) @@ -340,28 +345,28 @@ if (table_name %in% existing_tables) { delete_query <- sprintf( - "DELETE FROM %s WHERE name = :name", + "DELETE FROM %s WHERE name = ?", meta_table ) DBI::dbExecute( gdb, delete_query, - params = list(name = table_name) + params = list(table_name) ) } # using :name, :value, :date as placeholders insert_query <- sprintf( - "INSERT INTO %s (name, value, date) VALUES (:name, :value, :date)", + "INSERT INTO %s (name, value, date) VALUES (?, ?, ?)", meta_table ) DBI::dbExecute( gdb, insert_query, params = list( - name = table_name, - value = source_info, - date = date() + table_name, + source_info, + date() ) ) diff --git a/R/gdb-utils.R b/R/gdb-utils.R index dc5bd03..6d3fa6d 100755 --- a/R/gdb-utils.R +++ b/R/gdb-utils.R @@ -68,7 +68,7 @@ concatGdb <- function( if (verbose) { message(sprintf("Merging '%s'", gdb_i)) } - DBI::dbExecute(gdb, "attach :src as src", params = list(src = gdb_i)) + DBI::dbExecute(gdb, sprintf("ATTACH '%s' AS src", gdb_i)) DBI::dbExecute(gdb, "insert into var select * from src.var order by VAR_id") DBI::dbExecute( gdb, @@ -110,8 +110,44 @@ concatGdb <- function( as.character(round(Sys.time(), units = "secs")) )) } - DBI::dbExecute(gdb, "update var set VAR_id=rowid") - DBI::dbExecute(gdb, "update dosage set VAR_id=rowid") + # generate VAR_id map + DBI::dbExecute( + gdb, + " + CREATE TEMP TABLE VAR_id_map AS + SELECT + VAR_id AS old_id, + ROW_NUMBER() OVER ( + ORDER BY VAR_id + ) AS new_id + FROM var + " + ) + + # apply the mapping to the var table + DBI::dbExecute( + gdb, + " + UPDATE var + SET VAR_id = m.new_id + FROM VAR_id_map AS m + WHERE var.VAR_id = m.old_id + " + ) + + # apply the same mapping to the dosage table + DBI::dbExecute( + gdb, + " + UPDATE dosage + SET VAR_id = m.new_id + FROM VAR_id_map AS m + WHERE dosage.VAR_id = m.old_id + " + ) + + # clean up + DBI::dbExecute(gdb, "DROP TABLE VAR_id_map") } # generate indexes @@ -239,7 +275,7 @@ concatGdb <- function( if (verbose) { message("Creating SM, anno and cohort tables") } - DBI::dbExecute(gdb, "attach :src as src", params = list(src = gdb_list[1])) + DBI::dbExecute(gdb, sprintf("attach '%s' as src", gdb_list[1])) DBI::dbExecute(gdb, "create table SM as select * from src.SM") DBI::dbExecute(gdb, "detach src") DBI::dbExecute(gdb, "create table anno (name text,value text,date text)") @@ -278,7 +314,7 @@ setMethod( ) # list tables - master <- DBI::dbGetQuery(object, "select * from sqlite_master") + master <- DBI::dbGetQuery(object, "select * from sqlite_master") ##TODO: works with duckdb, but is this the best option? tables.base <- gdb_protected_tables[ !gdb_protected_tables %in% c("var_ranges", "tmp") ] @@ -365,6 +401,7 @@ setMethod( addRangedVarinfo(gdb, overwrite = TRUE, verbose = verbose) ## meta table + DBI::dbExecute(gdb, "DROP TABLE IF EXISTS meta") DBI::dbExecute(gdb, "create table meta (name text,value text)") .gdb_populate_meta_table( gdb = gdb, @@ -542,14 +579,23 @@ setMethod( for (copied in tables) { indexes <- master[ master$type == "index" & - master$tbl_name == copied, + master$tbl_name == copied & + !is.na(master$sql), ]$sql + ## skip if there are no indexes (is this necessary, or had the varInfo table always an index for VAR_id?) + if (length(indexes) == 0) { + next + } + + ## duckdb uses schema.table syntax (it must use attach_name.tablename) for (index in indexes) { - DBI::dbExecute( - gdb, - gsub("CREATE INDEX ", sprintf("CREATE INDEX %s.", attach_name), index) + query <- sub( + paste0("ON\\s+", copied, "\\b"), + paste0("ON ", attach_name, ".", copied), + index ) + DBI::dbExecute(gdb, query) } } diff --git a/R/gdb.R b/R/gdb.R index 3611bf8..75ce754 100644 --- a/R/gdb.R +++ b/R/gdb.R @@ -4,29 +4,33 @@ #' @rdname gdb #' @usage NULL #' @export -setMethod("show", signature = "gdb", definition = function(object) { +#' +setMethod("show", "gdb", function(object) { cat("rvat gdb object\n", "Path:", getGdbPath(object), "\n") }) #' @rdname gdb #' @usage NULL #' @export +#' setMethod("close", signature = "gdb", definition = function(con) { DBI::dbDisconnect(con) }) + # constructors -------------------------------------------- #' @rdname gdb #' @usage NULL #' @export + gdb <- function(path) { if (!file.exists(path)) { stop(sprintf("'%s' doesn't exist.", path), call. = FALSE) } tryCatch( { - con <- DBI::dbConnect(DBI::dbDriver("SQLite"), path) + con <- DBI::dbConnect(duckdb::duckdb(), path) }, error = function(e) { stop(sprintf("Invalid gdb path '%s'", path), call. = FALSE) @@ -35,10 +39,11 @@ gdb <- function(path) { new("gdb", con) } + gdb_init <- function(path) { tryCatch( { - con <- DBI::dbConnect(DBI::dbDriver("SQLite"), path) + con <- DBI::dbConnect(duckdb::duckdb(), path) }, error = function(e) { stop(sprintf("Invalid gdb path '%s'", path), call. = FALSE) @@ -92,7 +97,7 @@ setMethod("getGdbId", signature = "gdb", definition = function(object) { #' @usage NULL #' @export setMethod("getGdbPath", signature = "gdb", definition = function(object) { - object@dbname + DBI::dbGetInfo(object)$dbname }) #' @rdname gdb @@ -198,7 +203,7 @@ setMethod( ranges[x, ][["end"]] + padding ) query <- paste("select * from var where", paste(query, collapse = " or ")) - query <- RSQLite::dbGetQuery(gdb, query) + query <- DBI::dbGetQuery(gdb, query) query } queries <- do.call( diff --git a/R/mapToCDS.R b/R/mapToCDS.R index d189114..caf67cd 100755 --- a/R/mapToCDS.R +++ b/R/mapToCDS.R @@ -176,7 +176,7 @@ setMethod( .mapToCDS_get_overlapping_chroms <- function(object, exons) { chrom <- unique(as.character(seqnames(exons))) - chrom_gdb <- RSQLite::dbGetQuery(object, "select CHROM from var_ranges")$CHROM + chrom_gdb <- DBI::dbGetQuery(object, "select CHROM from var_ranges")$CHROM if (any(grepl("chr", chrom_gdb, fixed = TRUE))) { chrom_gdb <- gsub("chr", "", chrom_gdb, fixed = TRUE) } diff --git a/R/spatialClust.R b/R/spatialClust.R index 1884a32..0bdfb77 100755 --- a/R/spatialClust.R +++ b/R/spatialClust.R @@ -59,7 +59,7 @@ setMethod( # add grouped concat query <- sprintf( - "select unit, group_concat(VAR_id) as VAR_id, group_concat(weight) as weight, '%s' as varSetName, group_concat(POS) as POS from (%s) x group by unit", + "select unit, group_concat(VAR_id order by VAR_id) as VAR_id, group_concat(weight order by VAR_id) as weight, '%s' as varSetName, group_concat(POS order by VAR_id) as POS from (%s) x group by unit order by unit", varSetName, query ) @@ -77,10 +77,10 @@ setMethod( con = output ) - handle <- RSQLite::dbSendQuery(object, query) + handle <- DBI::dbSendQuery(object, query) on.exit(DBI::dbClearResult(handle), add = TRUE) - while (!RSQLite::dbHasCompleted(handle)) { - chunk <- RSQLite::dbFetch(handle, n = memlimit) + while (!DBI::dbHasCompleted(handle)) { + chunk <- DBI::dbFetch(handle, n = memlimit) if (nrow(chunk) == 0L) { break diff --git a/R/varSet.R b/R/varSet.R index 9a81d92..9054568 100644 --- a/R/varSet.R +++ b/R/varSet.R @@ -854,7 +854,7 @@ setMethod( # add grouped concat query <- sprintf( - "select unit, group_concat(VAR_id) as VAR_id, group_concat(w) as w, '%s' as varSetName from (%s) x group by unit", + "select unit, group_concat(VAR_id order by VAR_id) as VAR_id, group_concat(w order by VAR_id) as w, '%s' as varSetName from (%s) x group by unit order by unit", varSetName, query ) diff --git a/man/ACAT.Rd b/man/ACAT.Rd index c8e8032..83cde6f 100644 --- a/man/ACAT.Rd +++ b/man/ACAT.Rd @@ -65,7 +65,7 @@ library(rvatData) # this will first ACAT P-values across statistical tests # and then ACAT these P-values across varSets data(rvbresults) -rvbresults <- rvbresults[1:1000,] +rvbresults <- rvbresults[1:1000, ] ACAT( rvbresults, aggregate = list("test", "varSetName") @@ -78,8 +78,8 @@ ACAT( ) # Input P-value shouldn't be exactly 0 or 1 (see: https://github.com/yaowuliu/ACAT). -# By default, P-values that are exactly 0 or 1 are reset (`fixpval = TRUE`) to the -# minimum P-value (>0) and maximum P-value (<1) in the results. +# By default, P-values that are exactly 0 or 1 are reset (`fixpval = TRUE`) to the +# minimum P-value (>0) and maximum P-value (<1) in the results. # Alternatives include: # manual minmax values: ACAT( diff --git a/man/aggdb.Rd b/man/aggdb.Rd index 2488779..dfa09e4 100644 --- a/man/aggdb.Rd +++ b/man/aggdb.Rd @@ -66,22 +66,28 @@ gdb <- create_example_gdb() # generate the aggregates based on a varSetFile varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) -varset <- getVarSet(varsetfile, unit = c("NEK1", "SOD1", "ABCA4"), varSetName = "High") +varset <- getVarSet( + varsetfile, + unit = c("NEK1", "SOD1", "ABCA4"), + varSetName = "High" +) aggfile <- tempfile() -aggregate(x = gdb, - varSet = varset, - maxMAF = 0.001, - output = aggfile, - verbose = FALSE) +aggregate( + x = gdb, + varSet = varset, + maxMAF = 0.001, + output = aggfile, + verbose = FALSE +) # connect to aggdb, see ?aggdb for more details aggdb <- aggdb(aggfile) -# list units and samples in aggdb +# list units and samples in aggdb head(listUnits(aggdb)) head(listSamples(aggdb)) -# retrieve aggregates +# retrieve aggregates aggregates <- getUnit(aggdb, unit = "SOD1") # see ?`assocTest-aggdb` for details on running association tests on an aggdb diff --git a/man/assocTest-aggdb.Rd b/man/assocTest-aggdb.Rd index 7d69252..a6ad020 100644 --- a/man/assocTest-aggdb.Rd +++ b/man/assocTest-aggdb.Rd @@ -75,7 +75,11 @@ gdb <- create_example_gdb() # assocTest-aggdb allows for running association tests on pre-constructed aggregates. # below we first generate the aggregates based on a varSetFile varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) -varset <- getVarSet(varsetfile, unit = c("NEK1", "SOD1", "ABCA4"), varSetName = "High") +varset <- getVarSet( + varsetfile, + unit = c("NEK1", "SOD1", "ABCA4"), + varSetName = "High" +) aggfile <- tempfile() aggregate( x = gdb, diff --git a/man/assocTest-gdb.Rd b/man/assocTest-gdb.Rd index 852a925..1301e24 100644 --- a/man/assocTest-gdb.Rd +++ b/man/assocTest-gdb.Rd @@ -178,110 +178,137 @@ varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) # for example purposes, upload a small cohort cohort <- getCohort(gdb, "pheno") -cohort <- cohort[cohort$IID \%in\% colnames(GTsmall),] +cohort <- cohort[cohort$IID \%in\% colnames(GTsmall), ] uploadCohort(gdb, name = "phenosmall", value = cohort) # run a firth burden test on a binary phenotype -varsetlist <- getVarSet(varsetfile, unit = c("SOD1", "NEK1"), varSetName = "High") -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = "firth", - name = "example") +varsetlist <- getVarSet( + varsetfile, + unit = c("SOD1", "NEK1"), + varSetName = "High" +) +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = "firth", + name = "example" +) # run ACAT-v and SKAT tests -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("acatvfirth", "skat_burden_robust", "skato_robust"), - name = "example") +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("acatvfirth", "skat_burden_robust", "skato_robust"), + name = "example" +) # run a burden test on a continuous phenotype -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "age", - continuous = TRUE, - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("lm", "skat", "acatv"), - name = "example") - -# run single variant tests on a binary phenotype -sv <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - singlevar = TRUE, - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "glm", "scoreSPA"), - name = "example", - minCarriers = 1) +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "age", + continuous = TRUE, + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("lm", "skat", "acatv"), + name = "example" +) + +# run single variant tests on a binary phenotype +sv <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + singlevar = TRUE, + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "glm", "scoreSPA"), + name = "example", + minCarriers = 1 +) # similarly a list of VAR_ids can be specified instead of a varSetList/varSetFile -sv <- assocTest(gdb, - VAR_id = 1:20, - cohort = "phenosmall", - pheno = "pheno", - singlevar = TRUE, - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "glm", "scoreSPA"), - name = "example", - minCarriers = 1) +sv <- assocTest( + gdb, + VAR_id = 1:20, + cohort = "phenosmall", + pheno = "pheno", + singlevar = TRUE, + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "glm", "scoreSPA"), + name = "example", + minCarriers = 1 +) # apply variant filters -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - name = "example", - maxMAF = 0.05, - minCarriers = 1, - minCallrateVar = 0.9, - minCallrateSM = 0.95) +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + name = "example", + maxMAF = 0.05, + minCarriers = 1, + minCallrateVar = 0.9, + minCallrateSM = 0.95 +) # Perform MAF-weighted burden tests (madsen-browning) -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - MAFweights = "mb") +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + MAFweights = "mb" +) # CADD-weighted burden test -varsetlist_cadd <- getVarSet(varsetfile, unit = c("SOD1", "NEK1"), varSetName = "CADD") -rvb <- assocTest(gdb, - varSet = varsetlist_cadd, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv")) +varsetlist_cadd <- getVarSet( + varsetfile, + unit = c("SOD1", "NEK1"), + varSetName = "CADD" +) +rvb <- assocTest( + gdb, + varSet = varsetlist_cadd, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv") +) # Perform recessive burden test -rvb <- assocTest(gdb, - varSet = varsetlist, - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - geneticModel = "recessive") - -# Resampled burden test -rvb <- assocTest(gdb, - varSet = varsetlist[1], - cohort = "phenosmall", - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("skat", "skat_burden", "acatv"), - name = "example", - methodResampling = "permutation", - nResampling = 100) +rvb <- assocTest( + gdb, + varSet = varsetlist, + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + geneticModel = "recessive" +) +# Resampled burden test +rvb <- assocTest( + gdb, + varSet = varsetlist[1], + cohort = "phenosmall", + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("skat", "skat_burden", "acatv"), + name = "example", + methodResampling = "permutation", + nResampling = 100 +) } diff --git a/man/assocTest-genoMatrix.Rd b/man/assocTest-genoMatrix.Rd index 0660e45..b7ddd6e 100644 --- a/man/assocTest-genoMatrix.Rd +++ b/man/assocTest-genoMatrix.Rd @@ -146,79 +146,96 @@ library(rvatData) data(GTsmall) # run a firth burden test on a binary phenotype -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = "firth", - name = "example") +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = "firth", + name = "example" +) # run ACAT-v and SKAT tests -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("acatvfirth", "skat_burden_robust", "skato_robust"), - name = "example") +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("acatvfirth", "skat_burden_robust", "skato_robust"), + name = "example" +) # run a burden test on a continuous phenotype -rvb <- assocTest(GTsmall, - pheno = "age", - continuous = TRUE, - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("lm", "skat", "acatv"), - name = "example") +rvb <- assocTest( + GTsmall, + pheno = "age", + continuous = TRUE, + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("lm", "skat", "acatv"), + name = "example" +) # run single variant tests on a binary phenotype -sv <- assocTest(GTsmall, - pheno = "pheno", - singlevar = TRUE, - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "glm", "scoreSPA"), - name = "example", - minCarriers = 1 - ) +sv <- assocTest( + GTsmall, + pheno = "pheno", + singlevar = TRUE, + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "glm", "scoreSPA"), + name = "example", + minCarriers = 1 +) # apply variant filters -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - name = "example", - maxMAF = 0.001, - minCarriers = 2, - minCallrateVar = 0.9, - minCallrateSM = 0.95) +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + name = "example", + maxMAF = 0.001, + minCarriers = 2, + minCallrateVar = 0.9, + minCallrateSM = 0.95 +) # Perform MAF-weighted burden tests (madsen-browning) -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - MAFweights = "mb") +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + MAFweights = "mb" +) # Perform weighted burden tests with custom weights gdb <- create_example_gdb() # add cadd scores caddscores <- getAnno(gdb, "varInfo", VAR_id = rownames(GTsmall)) -caddscores <- caddscores[match(rownames(GTsmall), caddscores$VAR_id),] +caddscores <- caddscores[match(rownames(GTsmall), caddscores$VAR_id), ] # CADD-weighted burden test -rvb <- assocTest(recode(GTsmall, weights = caddscores$CADDphred), - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv")) +rvb <- assocTest( + recode(GTsmall, weights = caddscores$CADDphred), + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv") +) # Perform recessive burden test -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("firth", "skat_robust", "acatv"), - geneticModel = "recessive") +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("firth", "skat_robust", "acatv"), + geneticModel = "recessive" +) # Resampled burden test -rvb <- assocTest(GTsmall, - pheno = "pheno", - covar = c("PC1", "PC2", "PC3", "PC4"), - test = c("skat", "skat_burden", "acatv"), - name = "example", - methodResampling = "permutation", - nResampling = 100) +rvb <- assocTest( + GTsmall, + pheno = "pheno", + covar = c("PC1", "PC2", "PC3", "PC4"), + test = c("skat", "skat_burden", "acatv"), + name = "example", + methodResampling = "permutation", + nResampling = 100 +) } diff --git a/man/buildCorMatrix.Rd b/man/buildCorMatrix.Rd index 9cc9f75..e1afbf6 100644 --- a/man/buildCorMatrix.Rd +++ b/man/buildCorMatrix.Rd @@ -47,15 +47,19 @@ gdb <- create_example_gdb() # generate the aggregates based on a varSetFile varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) -varset <- getVarSet(varsetfile, - unit = c("NEK1", "SOD1", "ABCA4"), - varSetName = "High") +varset <- getVarSet( + varsetfile, + unit = c("NEK1", "SOD1", "ABCA4"), + varSetName = "High" +) aggfile <- tempfile() -aggregate(x = gdb, - varSet = varset, - maxMAF = 0.001, - output = aggfile, - verbose = FALSE) +aggregate( + x = gdb, + varSet = varset, + maxMAF = 0.001, + output = aggfile, + verbose = FALSE +) # build a block-wise correlation matrix cormatrix <- buildCorMatrix( diff --git a/man/buildGeneSet.Rd b/man/buildGeneSet.Rd index 2ac4821..327466d 100644 --- a/man/buildGeneSet.Rd +++ b/man/buildGeneSet.Rd @@ -39,16 +39,18 @@ Currently these can be built directly from GMT-files, data.frames and lists. } \examples{ genesetlist <- buildGeneSet( - list("geneset1" = c("SOD1", "NEK1"), - "geneset2" = c("ABCA4", "SOD1", "NEK1"), - "geneset3" = c("FUS", "NEK1") - )) + list( + "geneset1" = c("SOD1", "NEK1"), + "geneset2" = c("ABCA4", "SOD1", "NEK1"), + "geneset3" = c("FUS", "NEK1") + ) +) geneset <- genesetlist[[1]] # list units and weights included in the geneSet listUnits(geneset) listWeights(geneset) -# note that usually you'll work with geneSets in either a geneSetList or +# note that usually you'll work with geneSets in either a geneSetList or # a geneSetFile (see ?geneSetList and ?geneSetFile) } diff --git a/man/buildVarSet-gdb.Rd b/man/buildVarSet-gdb.Rd index 663a55d..57e2b26 100644 --- a/man/buildVarSet-gdb.Rd +++ b/man/buildVarSet-gdb.Rd @@ -57,21 +57,25 @@ library(rvatData) # Build a varset including variants with a moderate predicted impact gdb <- create_example_gdb() varsetfile_moderate <- tempfile() -buildVarSet(object = gdb, - output = varsetfile_moderate, - varSetName = "Moderate", - unitTable = "varInfo", - unitName = "gene_name", - where = "ModerateImpact = 1") +buildVarSet( + object = gdb, + output = varsetfile_moderate, + varSetName = "Moderate", + unitTable = "varInfo", + unitName = "gene_name", + where = "ModerateImpact = 1" +) # Build a varset that contains CADD scores varsetfile_cadd <- tempfile() -buildVarSet(object = gdb, - output = varsetfile_cadd, - varSetName = "CADD", - unitTable = "varInfo", - unitName = "gene_name", - weightName = "CADDphred") +buildVarSet( + object = gdb, + output = varsetfile_cadd, + varSetName = "CADD", + unitTable = "varInfo", + unitName = "gene_name", + weightName = "CADDphred" +) # connect to varsetfile and retrieve variant sets varsetfile <- varSetFile(varsetfile_moderate) diff --git a/man/buildVarSet.Rd b/man/buildVarSet.Rd index 871e0dc..ffa004e 100644 --- a/man/buildVarSet.Rd +++ b/man/buildVarSet.Rd @@ -15,21 +15,25 @@ library(rvatData) # Build a varSetFile including variants with a moderate predicted impact gdb <- create_example_gdb() varsetfile_moderate <- tempfile() -buildVarSet(object = gdb, - output = varsetfile_moderate, - varSetName = "Moderate", - unitTable = "varInfo", - unitName = "gene_name", - where = "ModerateImpact = 1") +buildVarSet( + object = gdb, + output = varsetfile_moderate, + varSetName = "Moderate", + unitTable = "varInfo", + unitName = "gene_name", + where = "ModerateImpact = 1" +) # Build a varSetFile that contains CADD scores varsetfile_cadd <- tempfile() -buildVarSet(object = gdb, - output = varsetfile_cadd, - varSetName = "CADD", - unitTable = "varInfo", - unitName = "gene_name", - weightName = "CADDphred") +buildVarSet( + object = gdb, + output = varsetfile_cadd, + varSetName = "CADD", + unitTable = "varInfo", + unitName = "gene_name", + weightName = "CADDphred" +) # in addition to building a varSetFile from a gdb, # it can also be build directly from a data.frame diff --git a/man/collapseAggDbs.Rd b/man/collapseAggDbs.Rd index c76a23e..5980a4b 100644 --- a/man/collapseAggDbs.Rd +++ b/man/collapseAggDbs.Rd @@ -29,18 +29,22 @@ gdb <- create_example_gdb() # generate two aggregate files varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) aggdb1 <- tempfile() -aggregate(x = gdb, - varSet = getVarSet(varsetfile, unit = c("SOD1", "FUS"), varSetName = "High"), - maxMAF = 0.001, - output = aggdb1, - verbose = FALSE) +aggregate( + x = gdb, + varSet = getVarSet(varsetfile, unit = c("SOD1", "FUS"), varSetName = "High"), + maxMAF = 0.001, + output = aggdb1, + verbose = FALSE +) aggdb2 <- tempfile() -aggregate(x = gdb, - varSet = getVarSet(varsetfile, unit = c("NEK1"), varSetName = "High"), - maxMAF = 0.001, - output = aggdb2, - verbose = FALSE) +aggregate( + x = gdb, + varSet = getVarSet(varsetfile, unit = c("NEK1"), varSetName = "High"), + maxMAF = 0.001, + output = aggdb2, + verbose = FALSE +) # collapse aggdbs aggdb <- tempfile() diff --git a/man/gdb.Rd b/man/gdb.Rd index 9b52327..4173166 100644 --- a/man/gdb.Rd +++ b/man/gdb.Rd @@ -26,7 +26,7 @@ \description{ Compressed SQLite (".gdb") representation of genotype data and associated variant annotation and sample info tables. These allow for rapid and memory-efficient loading of sample genotype data and associated metadata within R. -The slots of the gdb class are inherited entirely from the \code{\link[RSQLite:SQLiteConnection-class]{RSQLite::SQLiteConnection}}. +The slots of the gdb class are inherited entirely from the duckdb::duckdb() connection. A host of RVAT methods described here allow for convenient querying and manipulation of a gdb, for complex queries users can also directly perform SQL queries on the gdb as exemplified in the examples and tutorials. } @@ -91,13 +91,11 @@ gdb and clear from annotation / cohort metadata tables. \examples{ library(rvatData) - + # build a gdb directly from a vcf file vcfpath <- rvat_example("rvatData.vcf.gz") # example vcf from rvatData package gdbpath <- tempfile() # write gdb to temporary file -buildGdb(vcf = vcfpath, - output = gdbpath, - genomeBuild = "GRCh38") +buildGdb(vcf = vcfpath, output = gdbpath, genomeBuild = "GRCh38") gdb <- gdb(gdbpath) # upload variant info @@ -113,9 +111,11 @@ listAnno(gdb) listCohort(gdb) # retrieve annotations for a genomic interval -varinfo <- getAnno(gdb, - table = "varinfo", - ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874)) +varinfo <- getAnno( + gdb, + table = "varinfo", + ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874) +) head(varinfo) # see ?getAnno for more details @@ -126,7 +126,7 @@ head(pheno) # delete table uploadAnno(gdb, name = "varInfo2", value = varinfo) -dropTable(gdb, "varInfo2") +dropTable(gdb, "varInfo2") # retrieve gdb metadata getGdbPath(gdb) @@ -135,26 +135,35 @@ getGenomeBuild(gdb) # retrieve genotypes # first extract variant ids that we want to retrieve the genotypes for # note that `where` here is an SQL compliant -varinfo <- getAnno(gdb, - table = "varinfo", - where = "gene_name = 'SOD1' and ModerateImpact = 1") +varinfo <- getAnno( + gdb, + table = "varinfo", + where = "gene_name = 'SOD1' and ModerateImpact = 1" +) -# retrieve genotypes for variants extracted above -GT <- getGT(gdb, - VAR_id = varinfo$VAR_id, - cohort = "pheno") +# retrieve genotypes for variants extracted above +GT <- getGT(gdb, VAR_id = varinfo$VAR_id, cohort = "pheno") # retrieve genotypes for a given genomic interval GT_fromranges <- getGT( - gdb, - ranges = data.frame(CHROM = "chr21", start = 31659666, end = 31668931), - cohort = "pheno", - anno = "varInfo", - annoFields = c("VAR_id", "CHROM", "POS", "REF", "ALT", "HighImpact", "ModerateImpact", "Synonymous") + gdb, + ranges = data.frame(CHROM = "chr21", start = 31659666, end = 31668931), + cohort = "pheno", + anno = "varInfo", + annoFields = c( + "VAR_id", + "CHROM", + "POS", + "REF", + "ALT", + "HighImpact", + "ModerateImpact", + "Synonymous" + ) ) # Learn about the getGT method in ?getGT # Learn about the genoMatrix format in ?genoMatrix - + # Learn more about the downstream methods available for the gdb class in the relevant help pages # e.g. ?mapVariants, ?subsetGdb, ?assocTest, ?writeVcf close(gdb) diff --git a/man/geneSet.Rd b/man/geneSet.Rd index 0d1ea93..406d463 100644 --- a/man/geneSet.Rd +++ b/man/geneSet.Rd @@ -40,17 +40,19 @@ In the following code snippets, x is a geneSet object. \examples{ genesetlist <- buildGeneSet( - list("geneset1" = c("SOD1", "NEK1"), - "geneset2" = c("ABCA4", "SOD1", "NEK1"), - "geneset3" = c("FUS", "NEK1") - )) + list( + "geneset1" = c("SOD1", "NEK1"), + "geneset2" = c("ABCA4", "SOD1", "NEK1"), + "geneset3" = c("FUS", "NEK1") + ) +) geneset <- genesetlist[[1]] # list units and weights included in the geneSet listUnits(geneset) listWeights(geneset) -# note that usually you'll work with geneSets in either a geneSetList or +# note that usually you'll work with geneSets in either a geneSetList or # a geneSetFile (see ?geneSetList and ?geneSetFile) } \seealso{ diff --git a/man/geneSetAssoc.Rd b/man/geneSetAssoc.Rd index 5b037d4..ff94cb3 100644 --- a/man/geneSetAssoc.Rd +++ b/man/geneSetAssoc.Rd @@ -103,15 +103,19 @@ Perform self-contained or competitive gene set analyses. \examples{ library(rvatData) data(rvbresults) -res <- rvbresults[rvbresults$test == "firth" & - rvbresults$varSetName == "ModerateImpact", ] +res <- rvbresults[ + rvbresults$test == "firth" & + rvbresults$varSetName == "ModerateImpact", +] # example genesetlist used in examples below (see ?buildGeneSet on build geneSetLists/geneSetFiles) genesetlist <- buildGeneSet( - list("geneset1" = c("SOD1", "NEK1"), - "geneset2" = c("ABCA4", "SOD1", "NEK1"), - "geneset3" = c("FUS", "NEK1") - )) + list( + "geneset1" = c("SOD1", "NEK1"), + "geneset2" = c("ABCA4", "SOD1", "NEK1"), + "geneset3" = c("FUS", "NEK1") + ) +) # Perform competitive gene set analysis using a linear model GSAresults <- geneSetAssoc( @@ -121,13 +125,13 @@ GSAresults <- geneSetAssoc( test = c("lm") ) -# Outlying gene association scores can be remedied by either setting Z-score cutoffs (i.e. all Z-scores exceeding these values will be set to the respective cutoff), +# Outlying gene association scores can be remedied by either setting Z-score cutoffs (i.e. all Z-scores exceeding these values will be set to the respective cutoff), # or inverse normal transforming the Z-scores: GSAresults <- geneSetAssoc( res, genesetlist, covar = c("nvar"), - test = c("lm"), + test = c("lm"), maxSetSize = 500, Zcutoffs = c(-4, 4) # lower and upper bounds ) @@ -136,20 +140,20 @@ GSAresults <- geneSetAssoc( res, genesetlist, covar = c("nvar"), - test = c("lm"), + test = c("lm"), maxSetSize = 500, INT = TRUE # perform inverse normal transformation ) -# Conditional gene set analyses can be performed to test whether gene sets are associated independently with the phenotype of interest. +# Conditional gene set analyses can be performed to test whether gene sets are associated independently with the phenotype of interest. # In the example below we test whether gene sets are independent of geneset1 GSAresults <- geneSetAssoc( res, condition = getGeneSet(genesetlist, "geneset1"), geneSet = genesetlist, covar = c("nvar"), - test = c("lm"), + test = c("lm"), maxSetSize = 500 ) @@ -158,7 +162,7 @@ GSAresults <- geneSetAssoc( res, genesetlist, covar = c("nvar"), - test = c("lm"), + test = c("lm"), maxSetSize = 500, oneSided = TRUE ) @@ -169,7 +173,7 @@ GSAresults <- geneSetAssoc( GSAresults <- geneSetAssoc( res, genesetlist, - test = c("fisher"), + test = c("fisher"), threshold = 1e-4, maxSetSize = 500 ) diff --git a/man/geneSetFile.Rd b/man/geneSetFile.Rd index 947b0de..6ce6d39 100644 --- a/man/geneSetFile.Rd +++ b/man/geneSetFile.Rd @@ -80,12 +80,13 @@ library(rvatData) # can also be build based on a GMT-file (see ?buildGeneSet) file <- tempfile() genesetfile <- buildGeneSet( - list("geneset1" = c("SOD1", "NEK1"), - "geneset2" = c("ABCA4", "SOD1", "NEK1"), - "geneset3" = c("FUS", "NEK1") - ), + list( + "geneset1" = c("SOD1", "NEK1"), + "geneset2" = c("ABCA4", "SOD1", "NEK1"), + "geneset3" = c("FUS", "NEK1") + ), output = file - ) +) # connect to a geneSetFile genesetfile <- geneSetFile(file) @@ -93,7 +94,7 @@ genesetfile <- geneSetFile(file) # extract a couple of gene sets from the geneSetFile, which will return a geneSetList (see ?geneSetList) getGeneSet(genesetfile, c("geneset1", "geneset2")) -# list included gene sets +# list included gene sets genesets <- listGeneSets(genesetfile) head(genesets) diff --git a/man/geneSetList.Rd b/man/geneSetList.Rd index cdd53e9..550e0c0 100644 --- a/man/geneSetList.Rd +++ b/man/geneSetList.Rd @@ -99,11 +99,12 @@ library(rvatData) # build a geneSetList # can also be build based on a GMT-file (see ?buildGeneSet) genesetlist <- buildGeneSet( - list("geneset1" = c("SOD1", "NEK1"), - "geneset2" = c("ABCA4", "SOD1", "NEK1"), - "geneset3" = c("FUS", "NEK1") - ) + list( + "geneset1" = c("SOD1", "NEK1"), + "geneset2" = c("ABCA4", "SOD1", "NEK1"), + "geneset3" = c("FUS", "NEK1") ) +) # extract a couple of gene sets from the geneSetList, which will return a new geneSetList getGeneSet(genesetlist, c("geneset1", "geneset2")) @@ -130,7 +131,12 @@ dropUnits(genesetlist, unit = "NEK1") # remap IDs linker <- data.frame( gene_name = c("SOD1", "NEK1", "FUS", "ABCA4"), - gene_id = c("ENSG00000142168","ENSG00000137601", "ENSG00000089280", "ENSG00000198691") + gene_id = c( + "ENSG00000142168", + "ENSG00000137601", + "ENSG00000089280", + "ENSG00000198691" + ) ) genesetlist_remapped <- remapIDs(genesetlist, linker) listUnits(genesetlist_remapped) diff --git a/man/genoMatrix.Rd b/man/genoMatrix.Rd index 842d503..02c0f11 100644 --- a/man/genoMatrix.Rd +++ b/man/genoMatrix.Rd @@ -139,10 +139,10 @@ dim(GT) GT[1:5, 1:5] # Subset samples based on sample info -GT[ ,GT$pheno == 1] +GT[, GT$pheno == 1] # Subset first two variants -GT[1:2,] +GT[1:2, ] # Extract variant/sample summaries -------------------------------------------------- @@ -170,7 +170,7 @@ recode(GT, geneticModel = "dominant") recode(GT, geneticModel = "recessive") # see ?recode for details -# recode missing genotypes +# recode missing genotypes recode(GT, imputeMethod = "meanImpute") recode(GT, imputeMethod = "missingToRef") # see ?recode for details @@ -181,19 +181,22 @@ recode(GT, imputeMethod = "missingToRef") aggregate(recode(GT, imputeMethod = "meanImpute")) # set `returnGT = FALSE` to return aggregates as a vector instead -aggregate <- aggregate(recode(GT, imputeMethod = "meanImpute"), returnGT = FALSE) +aggregate <- aggregate( + recode(GT, imputeMethod = "meanImpute"), + returnGT = FALSE +) head(aggregate) # Update cohort and variant info in genoMatrix -------------------------- gdb <- create_example_gdb() anno <- getAnno(gdb, "varInfo", fields = c("VAR_id", "CADDphred", "PolyPhen")) -updateGT(GT, anno = anno ) +updateGT(GT, anno = anno) pheno <- colData(GT) -updateGT(GT, SM = colData(GT)[,1:4]) +updateGT(GT, SM = colData(GT)[, 1:4]) # Get variant carriers -# retrieve a data.frame that lists the samples carrying each respective +# retrieve a data.frame that lists the samples carrying each respective # variant in the genoMatrix. Additional variant and sample info can be included # using the `rowDataFields` and `colDataFields` respectively. carriers <- getCarriers( diff --git a/man/getAnno.Rd b/man/getAnno.Rd index f6f1690..1fedf4f 100644 --- a/man/getAnno.Rd +++ b/man/getAnno.Rd @@ -54,30 +54,40 @@ varinfo <- getAnno(gdb, table = "varInfo") head(varinfo) # extract a genomic range -varinfo <- getAnno(gdb, - table = "varInfo", - ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874)) +varinfo <- getAnno( + gdb, + table = "varInfo", + ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874) +) head(varinfo) # keep only specified fields -varinfo <- getAnno(gdb, - table = "varInfo", - fields = c("VAR_id", "CHROM", "POS", "REF", "ALT", "ModerateImpact"), - ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874)) +varinfo <- getAnno( + gdb, + table = "varInfo", + fields = c("VAR_id", "CHROM", "POS", "REF", "ALT", "ModerateImpact"), + ranges = data.frame(CHROM = "chr1", start = 11013847, end = 11016874) +) head(varinfo) # the `where` parameter can be used to to pass an SQL-compliant where clause t -varinfo <- getAnno(gdb, - table = "varInfo", - where = "gene_name = 'SOD1' and ModerateImpact = 1") +varinfo <- getAnno( + gdb, + table = "varInfo", + where = "gene_name = 'SOD1' and ModerateImpact = 1" +) head(varinfo) # the `inner` and `left` parameters can be used to perform inner and left join operations respectively # e.g. we can use the `inner` parameter to filter e.g. based on a table containing QC-passing variants # for example: -uploadAnno(gdb, name = "QCpass", value = data.frame(VAR_id = 1:100), skipRemap = TRUE, verbose = FALSE) -varinfo <- getAnno(gdb, - inner = "QCpass", - table = "varInfo") +uploadAnno( + gdb, + name = "QCpass", + value = data.frame(VAR_id = 1:100), + skipRemap = TRUE, + verbose = FALSE +) +varinfo <- getAnno(gdb, inner = "QCpass", table = "varInfo") } diff --git a/man/getGT.Rd b/man/getGT.Rd index 57f27cd..a62e7de 100644 --- a/man/getGT.Rd +++ b/man/getGT.Rd @@ -90,11 +90,16 @@ library(rvatData) gdb <- create_example_gdb() # retrieve genotypes of a set of variants based on their VAR_ids -varinfo <- getAnno(gdb, table = "varinfo", where = "gene_name = 'SOD1' and ModerateImpact = 1") +varinfo <- getAnno( + gdb, + table = "varinfo", + where = "gene_name = 'SOD1' and ModerateImpact = 1" +) GT <- getGT( gdb, VAR_id = varinfo$VAR_id, - cohort = "pheno") + cohort = "pheno" +) # retrieve genotypes of a set of variants in a varSet varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) @@ -102,14 +107,16 @@ varset <- getVarSet(varsetfile, unit = "NEK1", varSetName = "High") GT <- getGT( gdb, varSet = varset, - cohort = "pheno") + cohort = "pheno" +) # see ?varSetFile and ?getVarSet for more details # retrieve genotypes for a genomic interval GT <- getGT( gdb, ranges = data.frame(CHROM = "chr21", start = 31659666, end = 31668931), - cohort = "pheno") + cohort = "pheno" +) # the `anno` parameter can be specified to include variant annotations in the rowData of the genoMatrix GT <- getGT( @@ -117,7 +124,16 @@ GT <- getGT( ranges = data.frame(CHROM = "chr21", start = 31659666, end = 31668931), cohort = "pheno", anno = "varInfo", - annoFields = c("VAR_id", "CHROM", "POS", "REF", "ALT", "HighImpact", "ModerateImpact", "Synonymous") + annoFields = c( + "VAR_id", + "CHROM", + "POS", + "REF", + "ALT", + "HighImpact", + "ModerateImpact", + "Synonymous" + ) ) head(rowData(GT)) @@ -134,7 +150,11 @@ head(rowData(GT)) # to assign variant ploidy. (diploid, XnonPAR, YnonPAR). Accepted inputs are GRCh37, hg19, GRCh38, hg38. # We recommend, however, to set the genome build when building the gdb: the genome build will theb # be included in the gdb metadata and used automatically. see ?buildGdb for details -varinfo <- getAnno(gdb, table = "varinfo", where = "gene_name = 'UBQLN2' and ModerateImpact = 1") +varinfo <- getAnno( + gdb, + table = "varinfo", + where = "gene_name = 'UBQLN2' and ModerateImpact = 1" +) GT <- getGT( gdb, VAR_id = varinfo$VAR_id, diff --git a/man/getVarSet.Rd b/man/getVarSet.Rd index 46acdba..9b35ec2 100644 --- a/man/getVarSet.Rd +++ b/man/getVarSet.Rd @@ -34,10 +34,7 @@ head(varsets) unique(listVarSets(varsetfile)) # specific varSets can be selected using the `varSetName` parameter -varsets <- getVarSet(varsetfile, - unit = c("SOD1", "FUS"), - varSetName = "CADD" - ) +varsets <- getVarSet(varsetfile, unit = c("SOD1", "FUS"), varSetName = "CADD") head(varsets) # see ?varSetFile and ?varSetList for more details on connecting and handling varsetfiles. diff --git a/man/manhattan.Rd b/man/manhattan.Rd index 704f7b2..77d9136 100644 --- a/man/manhattan.Rd +++ b/man/manhattan.Rd @@ -51,27 +51,43 @@ library(rvatData) data(rvbresults) # generate manhatan plot -man <- manhattan(rvbresults[rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth",], - label = "unit", - contigs = "GRCh38") +man <- manhattan( + rvbresults[ + rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth", + ], + label = "unit", + contigs = "GRCh38" +) # if many overlapping gene label, try setting `labelRepel = TRUE` -man <- manhattan(rvbresults[rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth",], - label = "unit", - labelRepel = TRUE, - contigs = "GRCh38") +man <- manhattan( + rvbresults[ + rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth", + ], + label = "unit", + labelRepel = TRUE, + contigs = "GRCh38" +) # alter the significane threshold using the `threshold` parameter -man <- manhattan(rvbresults[rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth",], - label = "unit", - labelRepel = TRUE, - threshold = 1e-5, - contigs = "GRCh38") +man <- manhattan( + rvbresults[ + rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth", + ], + label = "unit", + labelRepel = TRUE, + threshold = 1e-5, + contigs = "GRCh38" +) # the threshold for displaying labels can be set using the `labelTrheshold` parameter -man <- manhattan(rvbresults[rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth",], - label = "unit", - labelRepel = TRUE, - labelThreshold = 1e-12, - contigs = "GRCh38") +man <- manhattan( + rvbresults[ + rvbresults$varSetName == "ModerateImpact" & rvbresults$test == "firth", + ], + label = "unit", + labelRepel = TRUE, + labelThreshold = 1e-12, + contigs = "GRCh38" +) } diff --git a/man/mapVariants.Rd b/man/mapVariants.Rd index 855bb4d..f1e87e2 100644 --- a/man/mapVariants.Rd +++ b/man/mapVariants.Rd @@ -82,10 +82,7 @@ ranges <- GRanges( gene_name = c("SOD1", "NEK1") ) -mapVariants(gdb, - ranges = ranges, - uploadName = "gene", - verbose = FALSE) +mapVariants(gdb, ranges = ranges, uploadName = "gene", verbose = FALSE) # similarly, ranges can be a data.frame ranges <- data.frame( @@ -95,38 +92,48 @@ ranges <- data.frame( gene_name = c("SOD1", "NEK1") ) -mapVariants(gdb, - ranges = ranges, - uploadName = "gene", - verbose = FALSE, - overWrite = TRUE) +mapVariants( + gdb, + ranges = ranges, + uploadName = "gene", + verbose = FALSE, + overWrite = TRUE +) # often you'd want to map variants to a large set of ranges, such as ensembl models # mapVariants supports several file formats, including gff/gtf, bed and ranges # map variants using a gtf file gtffile <- tempfile(fileext = ".gtf") -rtracklayer::export(makeGRangesFromDataFrame(ranges), - con = gtffile, - format = "gtf") +rtracklayer::export( + makeGRangesFromDataFrame(ranges), + con = gtffile, + format = "gtf" +) -mapVariants(gdb, - gff = gtffile, - uploadName = "gene", - verbose = FALSE, - overWrite = TRUE) +mapVariants( + gdb, + gff = gtffile, + uploadName = "gene", + verbose = FALSE, + overWrite = TRUE +) # map variants using a bed file bedfile <- tempfile(fileext = ".bed") -rtracklayer::export(makeGRangesFromDataFrame(ranges), - con = bedfile, - format = "bed") -mapVariants(gdb, - bed = bedfile, - uploadName = "gene", - verbose = FALSE, - overWrite = TRUE) +rtracklayer::export( + makeGRangesFromDataFrame(ranges), + con = bedfile, + format = "bed" +) +mapVariants( + gdb, + bed = bedfile, + uploadName = "gene", + verbose = FALSE, + overWrite = TRUE +) # see the variant annotation tutorial on the rvat website for more details } diff --git a/tests/testthat/data/rvatData.gdb b/tests/testthat/data/rvatData.gdb new file mode 100644 index 0000000..9b83b52 Binary files /dev/null and b/tests/testthat/data/rvatData.gdb differ diff --git a/tests/testthat/setup.R b/tests/testthat/setup.R index 9702fc6..7987feb 100644 --- a/tests/testthat/setup.R +++ b/tests/testthat/setup.R @@ -4,3 +4,20 @@ library(rvatData) gdbpath <- rvat_example("rvatData.gdb") testgdb <- withr::local_tempfile() + +create_example_gdb <- function (gdbpath = NULL, filepath = NULL) +{ + if (is.null(filepath)) { + filepath <- tempfile() + } else { + if (file.exists(filepath)) { + stop(sprintf("%s already exists. Please delete or use a different filepath.", + filepath)) + } + } + gdb <- test_path("data/rvatData.gdb") + if (!file.copy(gdb, filepath)) { + stop("Failed to copy example gdb") + } + gdb(filepath) +} diff --git a/tests/testthat/test-assocTest.R b/tests/testthat/test-assocTest.R index 8c5f357..4783365 100644 --- a/tests/testthat/test-assocTest.R +++ b/tests/testthat/test-assocTest.R @@ -360,9 +360,10 @@ test_that("dominant/recessive model filtering works", { test_that("assocTest-gdb and assocTest-GT result in identical output", { params <- generate_param() + gdb <- create_example_gdb() warnings <- capture_warnings( run_test_gdb <- run_tests_gdb( - gdb(rvat_example("rvatData.gdb")), + gdb, params, VAR_id = rownames(GT), var_sv = rownames(GT)[1:25] diff --git a/tests/testthat/test-cli.R b/tests/testthat/test-cli.R index 3d5414d..f703e99 100644 --- a/tests/testthat/test-cli.R +++ b/tests/testthat/test-cli.R @@ -3,6 +3,9 @@ # - directly checks if arguments are passed correctly, rather than indirectly by checking outputs etc. # - tests will directly fail if parameters or defaults ihave not been properly updated in CLI +gdb <- create_example_gdb() +gdb_path <- getGdbPath(gdb) + # buildGdb test_that("--buildGdb works", { # generate mocked function/method that returns the environment @@ -189,7 +192,7 @@ test_that("--subsetGdb works", { commandArgs = function(trailingOnly = TRUE) { c( "--subsetGdb", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=example_subset.gdb" ) }, @@ -205,7 +208,7 @@ test_that("--subsetGdb works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, output = "example_subset.gdb" ) expected_args <- c( @@ -225,7 +228,7 @@ test_that("--subsetGdb works", { commandArgs = function(trailingOnly = TRUE) { c( "--subsetGdb", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=example.gdb", "--intersection=varInfo", "--where='ModerateImpact = 1'", @@ -248,7 +251,7 @@ test_that("--subsetGdb works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, output = "example.gdb", intersection = "varInfo", where = "'ModerateImpact = 1'", @@ -289,7 +292,7 @@ test_that("--uploadAnno works", { commandArgs = function(trailingOnly = TRUE) { c( "--uploadAnno", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--name=data", "--value=data.txt" ) @@ -306,7 +309,7 @@ test_that("--uploadAnno works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, name = "data", value = "data.txt" ) @@ -325,7 +328,7 @@ test_that("--uploadAnno works", { commandArgs = function(trailingOnly = TRUE) { c( "--uploadAnno", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--name=data", "--value=data.txt", "--sep='|'", @@ -350,7 +353,7 @@ test_that("--uploadAnno works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, name = "data", value = "data.txt", sep = "'|'", @@ -393,7 +396,7 @@ test_that("--uploadCohort works", { commandArgs = function(trailingOnly = TRUE) { c( "--uploadCohort", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--name=data", "--value=data.txt" ) @@ -410,7 +413,7 @@ test_that("--uploadCohort works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, name = "data", value = "data.txt" ) @@ -431,7 +434,7 @@ test_that("--uploadCohort works", { commandArgs = function(trailingOnly = TRUE) { c( "--uploadCohort", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--name=data", "--value=data.txt", "--sep='|'", @@ -451,7 +454,7 @@ test_that("--uploadCohort works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, name = "data", value = "data.txt", sep = "'|'", @@ -492,7 +495,7 @@ test_that("--listAnno works", { commandArgs = function(trailingOnly = TRUE) { c( "--listAnno", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")) + sprintf("--gdb=%s", gdb_path) ) }, .package = "base" @@ -507,7 +510,7 @@ test_that("--listAnno works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")) + object = gdb ) expected_args <- c( expected_args, @@ -537,7 +540,7 @@ test_that("--listCohort works", { commandArgs = function(trailingOnly = TRUE) { c( "--listCohort", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")) + sprintf("--gdb=%s", gdb_path) ) }, .package = "base" @@ -552,7 +555,7 @@ test_that("--listCohort works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")) + object = gdb ) expected_args <- c( expected_args, @@ -581,7 +584,7 @@ test_that("--mapVariants works", { commandArgs = function(trailingOnly = TRUE) { c( "--mapVariants", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")) + sprintf("--gdb=%s", gdb_path) ) }, .package = "base" @@ -596,7 +599,7 @@ test_that("--mapVariants works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")) + object = gdb ) expected_args <- c( expected_args, @@ -615,7 +618,7 @@ test_that("--mapVariants works", { commandArgs = function(trailingOnly = TRUE) { c( "--mapVariants", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--ranges=ranges.txt", "--gff=ensembl.gtf", "--bed=bedfile.bed", @@ -641,7 +644,7 @@ test_that("--mapVariants works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, ranges = "ranges.txt", gff = "ensembl.gtf", bed = "bedfile.bed", @@ -686,7 +689,7 @@ test_that("--buildVarSet works", { commandArgs = function(trailingOnly = TRUE) { c( "--buildVarSet", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--varSetName=LOF", "--unitTable=gene", "--unitName=gene_id", @@ -705,7 +708,7 @@ test_that("--buildVarSet works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "LOF", unitTable = "gene", unitName = "gene_id", @@ -728,7 +731,7 @@ test_that("--buildVarSet works", { commandArgs = function(trailingOnly = TRUE) { c( "--buildVarSet", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--varSetName=LOF", "--unitTable=gene", "--unitName=gene_id", @@ -752,7 +755,7 @@ test_that("--buildVarSet works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "LOF", unitTable = "gene", unitName = "gene_id", @@ -795,7 +798,7 @@ test_that("--spatialClust works", { commandArgs = function(trailingOnly = TRUE) { c( "--spatialClust", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=varsetfile.txt.gz", "--varSetName=LOF", "--unitTable=gene", @@ -816,7 +819,7 @@ test_that("--spatialClust works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "LOF", unitTable = "gene", unitName = "gene_id", @@ -843,7 +846,7 @@ test_that("--spatialClust works", { commandArgs = function(trailingOnly = TRUE) { c( "--spatialClust", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--varSetName=LOF", "--unitTable=gene", "--unitName=gene_id", @@ -868,7 +871,7 @@ test_that("--spatialClust works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "LOF", unitTable = "gene", unitName = "gene_id", @@ -915,7 +918,7 @@ test_that("--summariseGeno works", { commandArgs = function(trailingOnly = TRUE) { c( "--summariseGeno", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=sumgeno.txt.gz" ) }, @@ -931,7 +934,7 @@ test_that("--summariseGeno works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, output = "sumgeno.txt.gz" ) expected_args <- c( @@ -957,7 +960,7 @@ test_that("--summariseGeno works", { ## save varsetfile varsetfile <- withr::local_tempfile() buildVarSet( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "Moderate", unitTable = "varInfo", unitName = "gene_name", @@ -974,7 +977,7 @@ test_that("--summariseGeno works", { commandArgs = function(trailingOnly = TRUE) { c( "--summariseGeno", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=sumgeno.txt.gz", "--cohort=pheno", sprintf("--varSet=%s", varsetfile), @@ -1013,7 +1016,7 @@ test_that("--summariseGeno works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, cohort = "pheno", VAR_id = as.character(1:10), pheno = "pheno", @@ -1078,7 +1081,7 @@ test_that("--aggregate works", { commandArgs = function(trailingOnly = TRUE) { c( "--aggregate", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), sprintf("--VAR_id=%s", varids), "--output=aggregate.txt.gz" ) @@ -1095,7 +1098,7 @@ test_that("--aggregate works", { # check against expected args expected_args <- list( - x = gdb(rvatData::rvat_example("rvatData.gdb")), + x = gdb, VAR_id = as.character(1:10), output = "aggregate.txt.gz" ) @@ -1119,7 +1122,7 @@ test_that("--aggregate works", { ## save varsetfile varsetfile <- withr::local_tempfile() buildVarSet( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "Moderate", unitTable = "varInfo", unitName = "gene_name", @@ -1136,7 +1139,7 @@ test_that("--aggregate works", { commandArgs = function(trailingOnly = TRUE) { c( "--aggregate", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--cohort=pheno", sprintf("--varSet=%s", varsetfile), sprintf("--VAR_id=%s", varids), @@ -1177,7 +1180,7 @@ test_that("--aggregate works", { # check against expected args expected_args <- list( - x = gdb(rvatData::rvat_example("rvatData.gdb")), + x = gdb, cohort = "pheno", VAR_id = as.character(1:10), pheno = "pheno", @@ -1236,7 +1239,7 @@ test_that("--mergeAggDbs works", { # test defaults aggdb1 <- withr::local_tempfile() aggdb2 <- withr::local_tempfile() - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- gdb agg1 <- aggregate( gdb, VAR_id = 1:10, @@ -1345,7 +1348,7 @@ test_that("--collapseAggDbs works", { # test defaults aggdb1 <- withr::local_tempfile() aggdb2 <- withr::local_tempfile() - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- gdb agg1 <- aggregate( gdb, VAR_id = 1:10, @@ -1461,7 +1464,7 @@ test_that("--assocTest works", { commandArgs = function(trailingOnly = TRUE) { c( "--assocTest", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--pheno=pheno", "--test=firth", "--output=assocTest.txt.gz" @@ -1479,7 +1482,7 @@ test_that("--assocTest works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, pheno = "pheno", test = "firth", output = "assocTest.txt.gz" @@ -1504,7 +1507,7 @@ test_that("--assocTest works", { ## save varsetfile varsetfile <- withr::local_tempfile() null <- buildVarSet( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "Moderate", unitTable = "varInfo", unitName = "gene_name", @@ -1529,7 +1532,7 @@ test_that("--assocTest works", { commandArgs = function(trailingOnly = TRUE) { c( "--assocTest", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--pheno=pheno,sex", "--test=firth,glm,scoreSPA", "--cohort=pheno", @@ -1581,7 +1584,7 @@ test_that("--assocTest works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, pheno = c("pheno", "sex"), test = c("firth", "glm", "scoreSPA"), cohort = "pheno", @@ -1657,7 +1660,7 @@ test_that("--assocTest works", { genesetfile <- withr::local_tempfile() null <- buildVarSet( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, varSetName = "Moderate", unitTable = "varInfo", unitName = "gene_name", @@ -1667,7 +1670,7 @@ test_that("--assocTest works", { ) null <- aggregate( - x = gdb(rvatData::rvat_example("rvatData.gdb")), + x = gdb, varSet = getVarSet(varSetFile(varsetfile), unit = c("SOD1", "ABCA4")), output = aggdb, verbose = FALSE @@ -1690,7 +1693,7 @@ test_that("--assocTest works", { "--test=firth", sprintf("--aggdb=%s", aggdb), sprintf("--geneSet=%s", genesetfile), - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=assocTest.txt.gz" ) }, @@ -1742,7 +1745,7 @@ test_that("--assocTest works", { "--pheno=pheno", "--test=firth,glm", sprintf("--geneSet=%s", genesetfile), - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--cohort=pheno", "--name=test", "--continuous", @@ -1993,7 +1996,7 @@ test_that("--buildCorMatrix works", { # test defaults aggdb <- withr::local_tempfile() - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- gdb varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) varsetlist <- getVarSet( varsetfile, @@ -2199,7 +2202,7 @@ test_that("--geneSetAssoc works", { ## cormatrix aggdb <- withr::local_tempfile() cormatrix <- withr::local_tempfile() - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- gdb varsetfile <- varSetFile(rvat_example("rvatData_varsetfile.txt.gz")) varsetlist <- getVarSet( varsetfile, @@ -2410,7 +2413,7 @@ test_that("--writeVcf works", { commandArgs = function(trailingOnly = TRUE) { c( "--writeVcf", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=writeVcf.txt.gz" ) }, @@ -2426,7 +2429,7 @@ test_that("--writeVcf works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, output = "writeVcf.txt.gz" ) @@ -2452,7 +2455,7 @@ test_that("--writeVcf works", { commandArgs = function(trailingOnly = TRUE) { c( "--writeVcf", - sprintf("--gdb=%s", rvatData::rvat_example("rvatData.gdb")), + sprintf("--gdb=%s", gdb_path), "--output=writeVcf.txt.gz", sprintf("--VAR_id=%s", varids), sprintf("--IID=%s", iids), @@ -2475,7 +2478,7 @@ test_that("--writeVcf works", { # check against expected args expected_args <- list( - object = gdb(rvatData::rvat_example("rvatData.gdb")), + object = gdb, output = "writeVcf.txt.gz", VAR_id = as.character(1:10), IID = as.character(1:10), diff --git a/tests/testthat/test-gdb-extractRanges.R b/tests/testthat/test-gdb-extractRanges.R index 5a5a503..a211956 100644 --- a/tests/testthat/test-gdb-extractRanges.R +++ b/tests/testthat/test-gdb-extractRanges.R @@ -20,7 +20,7 @@ create_test_ranges <- function() { } test_that("extractRanges basic functionality works", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() ranges <- create_test_ranges() # check basic extraction @@ -48,7 +48,7 @@ test_that("extractRanges basic functionality works", { }) test_that("extractRanges handles different input formats", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # ensembl gene info gtf <- readr::read_tsv( @@ -84,7 +84,7 @@ test_that("extractRanges handles different input formats", { }) test_that("extractRanges padding functionality works", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # check whether INDELs are extracted correctly # includes three deletions that should overlap the start coordinates of ranges @@ -104,11 +104,11 @@ test_that("extractRanges padding functionality works", { ranges = ranges, padding = 10L ) - expect_identical(check_with_padding, c("1362", "1206", "1765")) + expect_identical(sort(check_with_padding), sort(c("1362", "1206", "1765"))) }) test_that("extractRanges handles edge cases correctly", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() ranges <- create_test_ranges() # check no overlap message and empty result @@ -148,7 +148,7 @@ test_that("extractRanges handles edge cases correctly", { }) test_that("extractRanges input validation works correctly", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() ranges <- create_test_ranges()$basic # expect an error when invalid padding values are provided diff --git a/tests/testthat/test-gdb-getGT.R b/tests/testthat/test-gdb-getGT.R index ee81077..546c376 100644 --- a/tests/testthat/test-gdb-getGT.R +++ b/tests/testthat/test-gdb-getGT.R @@ -1,6 +1,6 @@ test_that("getGT basic functionality works", { # GT with 100 variants for testing - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() GT <- expect_no_error(getGT( gdb, VAR_id = 1:100, @@ -27,7 +27,7 @@ test_that("getGT basic functionality works", { test_that("getGT handles invalid VAR_ids correctly", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # loading VAR_ids that are not present in gdb should yield a warning GT_reference <- getGT(gdb, VAR_id = 1:100, cohort = "pheno", verbose = FALSE) @@ -58,7 +58,7 @@ test_that("getGT handles invalid VAR_ids correctly", { }) test_that("getGT works with varSet and weights", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # build a varset containing CADD weights CADD_file <- withr::local_tempfile(fileext = ".txt.gz") @@ -105,7 +105,7 @@ test_that("getGT works with varSet and weights", { }) test_that("getGT handles different cohort specifications", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # test specifying gdb cohort name GT1 <- getGT(gdb, VAR_id = 1:100, cohort = "pheno", verbose = FALSE) @@ -125,7 +125,7 @@ test_that("getGT handles different cohort specifications", { }) test_that("getGT annotation functionality works", { - gdb <- gdb(rvatData::rvat_example("rvatData.gdb")) + gdb <- create_example_gdb() # GT without annotations GT1 <- getGT(gdb, VAR_id = 1:100, cohort = "pheno", verbose = FALSE)