Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ Imports:
readr,
ggh4x,
data.table,
rlang
rlang,
duckdb
Depends:
R (>= 3.4.1),
SummarizedExperiment,
Expand Down
1 change: 1 addition & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,7 @@ import(methods)
importClassesFrom(RSQLite,SQLiteConnection)
importClassesFrom(S4Vectors,DFrame)
importClassesFrom(S4Vectors,DataFrame)
importClassesFrom(duckdb,duckdb_connection)
importFrom(S4Vectors,setValidity2)
importFrom(magrittr,"%$%")
importFrom(magrittr,"%>%")
2 changes: 1 addition & 1 deletion R/aggDb.R
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ aggdb <- function(path) {
}
tryCatch(
{
con <- DBI::dbConnect(RSQLite::SQLite(), path)
con <- DBI::dbConnect(DBI::dbDriver("SQLite"), path)
},
error = function(e) {
stop(sprintf("Invalid aggdb path '%s'", path), call. = FALSE)
Expand Down
7 changes: 4 additions & 3 deletions R/allClasses.R
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,7 @@ setClass("genoMatrix", contains = "SummarizedExperiment")
#' @description
#' Compressed SQLite (".gdb") representation of genotype data and associated variant annotation and sample info tables.
#' These allow for rapid and memory-efficient loading of sample genotype data and associated metadata within R.
#' The slots of the gdb class are inherited entirely from the [`RSQLite::SQLiteConnection-class`].
#' The slots of the gdb class are inherited entirely from the `duckdb_connection` class of the duckdb package.
#' A host of RVAT methods described here allow for convenient querying and manipulation of a gdb, for complex queries
#' users can also directly perform SQL queries on the gdb as exemplified in the examples and tutorials.
#'
Expand Down Expand Up @@ -154,9 +154,9 @@ NULL
#'
#' @rdname gdb
#' @import DBI
#' @importClassesFrom RSQLite SQLiteConnection
#' @importClassesFrom duckdb duckdb_connection
#' @export
setClass("gdb", contains = "SQLiteConnection")
setClass("gdb", contains = "duckdb_connection")


#' Class to manage multiple varSets
Expand Down Expand Up @@ -706,6 +706,7 @@ NULL

#' @rdname aggdb
#' @usage NULL
#' @importClassesFrom RSQLite SQLiteConnection
#' @export
setClass("aggdb", contains = "SQLiteConnection")

Expand Down
2 changes: 1 addition & 1 deletion R/allGenerics.R
Original file line number Diff line number Diff line change
Expand Up @@ -417,7 +417,7 @@ setGeneric("insertVarRecord", function(object, record) {
standardGeneric("insertVarRecord")
})

setGeneric("insertDosageRecord", function(object, record) {
setGeneric("insertDosageRecord", function(object, record, var_id) {
standardGeneric("insertDosageRecord")
})

Expand Down
34 changes: 20 additions & 14 deletions R/gdb-anno-cohort.R
Original file line number Diff line number Diff line change
Expand Up @@ -54,10 +54,14 @@ setMethod(

# build where statement
if (!is.null(VAR_id)) {
VAR_id <- sprintf(
"VAR_id in (%s)",
paste(as.integer(VAR_id), collapse = ",")
)
if (length(VAR_id) > 0L) {
VAR_id <- sprintf(
"VAR_id in (%s)",
paste(as.integer(VAR_id), collapse = ",")
)
} else {
VAR_id <- "1=0"
}
if (length(where) > 0L) {
where <- sprintf("(%s) AND (%s)", VAR_id, where)
} else {
Expand Down Expand Up @@ -221,12 +225,14 @@ setMethod(
if (verbose) {
message(sprintf("Loading table '%s' from '%s'", name, value))
}
DBI::dbWriteTable(
con = gdb,
name = name,
value = value,
sep = sep,
overwrite = TRUE
DBI::dbExecute(
gdb,
sprintf(
"CREATE TABLE %s AS SELECT * FROM read_csv_auto('%s', delim='%s')",
name,
value,
sep
)
)
source_info <- value
} else {
Expand Down Expand Up @@ -605,17 +611,17 @@ setMethod(
if (meta_anno_exists) {
DBI::dbExecute(
object,
"DELETE FROM anno WHERE name = :table_to_remove",
params = list(table_to_remove = name)
"DELETE FROM anno WHERE name = ?",
params = list(name)
)
}

# remove from cohort meta (if exists)
if (meta_cohort_exists) {
DBI::dbExecute(
object,
"DELETE FROM cohort WHERE name = :table_to_remove",
params = list(table_to_remove = name)
"DELETE FROM cohort WHERE name = ?",
params = list(name)
)
}

Expand Down
138 changes: 96 additions & 42 deletions R/gdb-buildGdb.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
buildGdb <- function(
vcf,
output,
skipIndexes = FALSE,
skipIndexes = FALSE, ## DuckDB does indexing internally, so no need to do it manually as well ##TODO: check if this is true
skipVarRanges = FALSE,
overWrite = FALSE,
genomeBuild = NULL,
Expand Down Expand Up @@ -190,7 +190,7 @@ setMethod(
carrierN <- length(carrierIndex)
for (ai in seq_along(alleles)) {
# write variant info
insertVarRecord(
var_id <- insertVarRecord(
object,
record = c(records[i, 1:4], alleles[ai], records[i, 6:9])
)
Expand All @@ -203,14 +203,15 @@ setMethod(
collapse = "/"
)
}
insertDosageRecord(object, record = gtia)
insertDosageRecord(object, record = gtia, var_id = var_id)
}
# bi-allelic site
} else {
insertVarRecord(object, record = records[i, 1:9])
var_id <- insertVarRecord(object, record = records[i, 1:9])
insertDosageRecord(
object,
record = substr(records[i, -(1:9), drop = FALSE], 1, 3)
record = substr(records[i, -(1:9), drop = FALSE], 1, 3),
var_id = var_id
)
}
}
Expand All @@ -233,32 +234,34 @@ setMethod(
"insertVarRecord",
signature = "gdb",
definition = function(object, record) {
DBI::dbExecute(
var_id <- DBI::dbGetQuery(
object,
paste0(
"insert into var(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT) ",
"values (:chrom, :pos, :id, :ref, :alt, :qual, :flt, :info, :form)"
),
"INSERT INTO var(CHROM, POS, ID, REF, ALT, QUAL, FILTER, INFO, FORMAT)
VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
RETURNING VAR_id;",
params = list(
chrom = record[1],
pos = record[2],
id = record[3],
ref = record[4],
alt = record[5],
qual = record[6],
flt = record[7],
info = record[8],
form = record[9]
record[1], # CHROM
as.integer(record[2]), # POS
record[3], # ID
record[4], # REF
record[5], # ALT
record[6], # QUAL
record[7], # FILTER
record[8], # INFO
record[9] # FORMAT
)
)
)$VAR_id

return(var_id)
}
)

setMethod(
"insertDosageRecord",
signature = "gdb",
definition = function(object, record) {
record[record == "0/0"] <- "0"
definition = function(object, record, var_id) {
## record contains the rows of the VCF file, where each row corresponds to a variant; the first 9 columns correspond to headers (CHROM, POS, etc.) and the samples start at column 10. var_id is added so that it can be passed on from the insertVarRecord function
record[record == "0/0"] <- "0" ## all different options, if an allele is unknown (due to for example sequence quality), it is set to 0 (the reference allele)
record[record == "./0"] <- "0"
record[record == "0/1"] <- "1"
record[record == "1/0"] <- "1"
Expand All @@ -272,8 +275,9 @@ setMethod(
record[record == "1|1"] <- "2"
record[record == ".|."] <- "N"
record[record == ".:0"] <- "N"
obs <- sort(unique(c(record)))
obs <- sort(unique(c(record))) ## stores all unique genotypes
if (sum(!obs %in% c("0", "1", "2", "N")) > 0L) {
## if the obs contains other characters than 0, 1, 2, N it leads to an error
stop(
sprintf(
paste0(
Expand All @@ -285,12 +289,15 @@ setMethod(
call. = FALSE
)
}

## compress
record <- I(list(memCompress(paste(record, collapse = ""), type = "gzip")))

DBI::dbExecute(
## opens the connection to the gdb
object,
"insert into dosage(GT) values (:record)",
params = list(record = record)
"INSERT INTO dosage(VAR_id, GT)
VALUES (?, ?)", ## inserts into the var_id, since this is not done automatically with duckDB
params = list(var_id, record)
)
}
)
Expand Down Expand Up @@ -340,10 +347,10 @@ setMethod(
# insert GRanges into gdb as blobs per chromosome
DBI::dbExecute(
object,
statement = "insert into var_ranges(CHROM,ranges) values (:CHROM, :ranges)",
statement = "insert into var_ranges(CHROM,ranges) values (?, ?)",
params = list(
CHROM = chrom,
ranges = list(serialize(gr, connection = NULL))
chrom,
list(serialize(gr, connection = NULL))
)
)
}
Expand Down Expand Up @@ -373,23 +380,70 @@ setMethod(
invisible(NULL)
}

.buildgdb_create_schema <- function(gdb, verbose = TRUE) {
## drop tables if they already exist. This is needed in DuckDB.
.drop_duckdb_objects(gdb, verbose = verbose)

.buildgdb_create_schema <- function(gdb) {
DBI::dbExecute(
gdb,
paste0(
"create table var (VAR_id integer primary key, CHROM text, POS int, ",
"ID text, REF text, ALT text, QUAL text, FILTER text, INFO text, FORMAT text);"
)
)
DBI::dbExecute(gdb, "create table SM (IID text, sex int)")
# create sequence
DBI::dbExecute(gdb, "CREATE SEQUENCE var_seq START 1;")

# create table with default VAR_id from sequence
DBI::dbExecute(
gdb,
"create table dosage (VAR_id integer primary key, GT BLOB);"
"
CREATE TABLE var (
VAR_id INTEGER PRIMARY KEY DEFAULT nextval('var_seq'),
CHROM TEXT,
POS INT,
ID TEXT,
REF TEXT,
ALT TEXT,
QUAL TEXT,
FILTER TEXT,
INFO TEXT,
FORMAT TEXT
);"
)
DBI::dbExecute(gdb, "create table anno (name text,value text,date text)")
DBI::dbExecute(gdb, "create table cohort (name text,value text,date text)")
DBI::dbExecute(gdb, "create table meta (name text,value text)")

DBI::dbExecute(gdb, "CREATE TABLE SM (IID TEXT, sex INT);")
DBI::dbExecute(gdb, "CREATE TABLE dosage (VAR_id INTEGER, GT BLOB);")
DBI::dbExecute(gdb, "CREATE TABLE anno (name TEXT,value TEXT,date TEXT);")
DBI::dbExecute(gdb, "CREATE TABLE cohort (name TEXT,value TEXT,date TEXT);")
DBI::dbExecute(gdb, "CREATE TABLE meta (name TEXT,value TEXT);")

invisible(NULL)
}


# Remove pre-existing gdb schema objects so the schema can be created afresh.
#
# Drops the tables `var`, `SM`, `dosage`, `anno`, `cohort`, `meta` and
# `var_ranges` when they are present, and then the `var_seq` sequence.
#
# Arguments:
#   gdb     DBI connection on which the statements are executed.
#   verbose If TRUE, emit a message for every object that gets dropped.
#
# Returns `invisible(NULL)`; the function is called for its side effects.
.drop_duckdb_objects <- function(gdb, verbose = TRUE) {
  schema_tables <- c(
    "var",
    "SM",
    "dosage",
    "anno",
    "cohort",
    "meta",
    "var_ranges"
  )

  # DuckDB resolves identifiers case-insensitively, so a table stored as e.g.
  # "sm" still clashes with a later `CREATE TABLE SM`. Compare names without
  # regard to case so such tables are not silently skipped.
  existing_tables <- tolower(DBI::dbListTables(gdb))
  for (tbl in schema_tables) {
    if (tolower(tbl) %in% existing_tables) {
      if (verbose) {
        message(sprintf("Dropping existing table '%s'", tbl))
      }
      DBI::dbExecute(gdb, sprintf("DROP TABLE IF EXISTS %s;", tbl))
    }
  }

  # The sequence is dropped after the tables: the `var` table defines its
  # VAR_id default through nextval('var_seq').
  sequences <- DBI::dbGetQuery(
    gdb,
    "SELECT sequence_name FROM duckdb_sequences();"
  )
  if ("var_seq" %in% tolower(sequences$sequence_name)) {
    if (verbose) {
      message("Dropping existing sequence 'var_seq'")
    }
    DBI::dbExecute(gdb, "DROP SEQUENCE var_seq;")
  }

  # Return a consistent value regardless of which branch ran last.
  invisible(NULL)
}
4 changes: 2 additions & 2 deletions R/gdb-getGT.R
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ setMethod(
"select VAR_id, 'diploid' as ploidy, CHROM, POS, REF ",
"from var where VAR_id in (%s);"
),
paste(as.integer(VAR_id), collapse = ",")
paste(as.integer(VAR_id), collapse = ",") ##TODO, will this work if VAR_id is very large? or better to split into chunks?
)
)

Expand Down Expand Up @@ -404,7 +404,7 @@ setMethod(
"select VAR_id, GT from dosage where VAR_id in (%s);",
paste(as.integer(VAR_id), collapse = ",")
)
GT <- RSQLite::dbGetQuery(object, query)
GT <- DBI::dbGetQuery(object, query)

# check if all variants are found
success <- sum(VAR_id %in% GT[, 1])
Expand Down
Loading