diff --git a/.github/scripts/setup_azurite.sh b/.github/scripts/setup_azurite.sh
new file mode 100644
index 0000000..50998b2
--- /dev/null
+++ b/.github/scripts/setup_azurite.sh
@@ -0,0 +1,64 @@
+#!/bin/bash
+
+# The MIT License (MIT)
+# Copyright (c) 2024 dātma, inc™
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy of
+# this software and associated documentation files (the "Software"), to deal in
+# the Software without restriction, including without limitation the rights to
+# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
+# the Software, and to permit persons to whom the Software is furnished to do so,
+# subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
+# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
+# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
+# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+#
+
+set -e
+
+# Install dependencies
+sudo apt-get update
+sudo apt-get -y install azure-cli
+
+# Install Azurite
+sudo npm install -g azurite
+which azurite
+echo "azurite version = $(azurite --version)"
+
+AZURITE_DIR=$GITHUB_WORKSPACE/azurite
+mkdir -p $AZURITE_DIR
+pushd $AZURITE_DIR
+
+# Generate certificates
+openssl req -newkey rsa:2048 -x509 -nodes -keyout key.pem -new -out cert.pem -sha256 -days 365 -addext "subjectAltName=IP:127.0.0.1" -subj "/C=CO/ST=ST/L=LO/O=OR/OU=OU/CN=CN"
+sudo cp $AZURITE_DIR/cert.pem /usr/local/share/ca-certificates/ca-certificates.crt
+sudo update-ca-certificates
+
+# Start azurite
+azurite --silent --skipApiVersionCheck --loose --location $AZURITE_DIR --cert cert.pem --key key.pem &> /dev/null &
+
+# Env to run tests
+export AZURE_STORAGE_ACCOUNT=devstoreaccount1
+export AZURE_STORAGE_KEY=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==
+export AZURE_STORAGE_SERVICE_ENDPOINT="https://127.0.0.1:10000/devstoreaccount1"
+export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
+
+# Create a container called "test", as TileDB expects the container to exist already
+AZURE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=https://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=https://127.0.0.1:10001/devstoreaccount1;"
+az storage container create -n test --connection-string $AZURE_CONNECTION_STRING
+
+# Set up the examples workspace on azurite
+cd $GITHUB_WORKSPACE/examples
+tar xzvf examples_ws.tgz
+echo "Azure Storage Blob upload-batch..."
+az storage blob upload-batch -d test/ws -s ws --connection-string $AZURE_CONNECTION_STRING
+echo "Azure Storage Blob upload-batch DONE"
+
+popd
diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml
index 3f5a1bc..7e18d4a 100644
--- a/.github/workflows/basic.yml
+++ b/.github/workflows/basic.yml
@@ -96,3 +96,15 @@ jobs:
         run: |
           PYTHONPATH=. pytest
           PYTHONPATH=. python test/test.py
+          tar xzf examples/examples_ws.tgz
+          PYTHONPATH=. WORKSPACE=ws ./examples/test.sh
+
+      - name: Cloud Test - Azurite
+        run: |
+          source .github/scripts/setup_azurite.sh
+          echo "Testing on Azurite..."
+          PYTHONPATH=. WORKSPACE=az://test/ws ./examples/test.sh
+          ls -l /tmp/tiledb_bookkeeping
+          echo "Testing on Azurite DONE"
+
+
diff --git a/examples/examples_ws.tgz b/examples/examples_ws.tgz
new file mode 100644
index 0000000..123526f
Binary files /dev/null and b/examples/examples_ws.tgz differ
diff --git a/examples/genomicsdb_cache b/examples/genomicsdb_cache
index 60b1c75..cd1a879 100755
--- a/examples/genomicsdb_cache
+++ b/examples/genomicsdb_cache
@@ -27,16 +27,11 @@
 #
 
 import argparse
-import os
-
-import genomicsdb
+import json
 
+import genomicsdb_common
 
-def normalize_path(path):
-    if "://" in path:
-        return path
-    else:
-        return os.path.abspath(path)
+import genomicsdb
 
 
 def is_cloud_path(path):
@@ -81,10 +76,23 @@
     parser.add_argument(
         "-l", "--loader", required=False, help="Optional - URL to loader file. Defaults to loader.json in workspace"
     )
+    parser.add_argument(
+        "-i",
+        "--interval",
+        action="append",
+        required=False,
+        help="genomic intervals over which to operate. The intervals should be specified in the CONTIG:START-END format, with START and END optional.\nThis argument may be specified 0 or more times, e.g. -i chr1:1-10000 -i chr2 -i chr3:1000. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. either samples and/or intervals must be specified using the -i/-I/-s/-S options",  # noqa
+    )
+    parser.add_argument(
+        "-I",
+        "--interval-list",
+        required=False,
+        help="genomic intervals listed in a file over which to operate.\nThe intervals should be specified in the CONTIG:START-END format, with START and END optional, one interval per line. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. either samples and/or intervals must be specified using the -i/-I/-s/-S options",  # noqa
+    )
 
     args = parser.parse_args()
 
-    workspace = normalize_path(args.workspace)
+    workspace = genomicsdb_common.normalize_path(args.workspace)
     if not genomicsdb.is_file(workspace + "/__tiledb_workspace.tdb"):
         raise RuntimeError(f"workspace({workspace}) not found")
 
@@ -113,6 +121,14 @@
     if is_cloud_path(loader_file):
         with open("loader.json", "wb") as f:
             f.write(genomicsdb.read_entire_file(loader_file).encode())
+    if is_cloud_path(workspace) and (args.interval or args.interval_list):
+        contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, args.interval or args.interval_list)
+        loader = json.loads(genomicsdb.read_entire_file("loader.json"))
+        partitions = loader["column_partitions"]
+
+        for array in genomicsdb_common.get_arrays(contigs_map, intervals, partitions):
+            print(f"Caching fragments for array {array}")
+            genomicsdb.cache_array_metadata(workspace, array)
 
 
 if __name__ == "__main__":
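The caching path added to `genomicsdb_cache` above composes the helpers that are factored out into `examples/genomicsdb_common.py` in the next file. A minimal sketch of that flow, assuming the workspace's vidmap.json and loader.json have already been copied down locally by the tool (the az:// URL and interval values below are illustrative):

```python
# Sketch of the interval-scoped caching flow; the file names assume the
# local copies that genomicsdb_cache makes for cloud workspaces.
import json

import genomicsdb
import genomicsdb_common

workspace = "az://test/ws"  # illustrative cloud workspace URL
intervals = ["chr1:1-10000", "chr2"]

# Map contig names to their vidmap entries; the interval list passes through.
contigs_map, intervals = genomicsdb_common.parse_vidmap_json("vidmap.json", intervals)

# The loader's column partitions determine which array backs each column range.
partitions = json.loads(genomicsdb.read_entire_file("loader.json"))["column_partitions"]

# Cache fragment metadata only for arrays overlapping the requested intervals.
for array in genomicsdb_common.get_arrays(contigs_map, intervals, partitions):
    genomicsdb.cache_array_metadata(workspace, array)
```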
diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py
new file mode 100644
index 0000000..2a926f9
--- /dev/null
+++ b/examples/genomicsdb_common.py
@@ -0,0 +1,101 @@
+#
+# genomicsdb_common python script
+#
+# The MIT License
+#
+# Copyright (c) 2024 dātma, inc™
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+# THE SOFTWARE.
+#
+
+import json
+import os
+import re
+
+import genomicsdb
+
+
+def normalize_path(path):
+    if "://" in path:
+        return path
+    else:
+        return os.path.abspath(path)
+
+
+def parse_vidmap_json(vidmap_file, intervals=None):
+    if isinstance(intervals, str):
+        is_file = True
+    else:
+        is_file = False
+    contigs_map = {}
+    vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
+    contigs = vidmap["contigs"]
+    if intervals and is_file:
+        with open(intervals) as file:
+            intervals = [line.rstrip() for line in file]
+    all_intervals = not intervals
+    if not intervals:
+        intervals = []
+    for contig in contigs:
+        contigs_map[contig["name"]] = contig
+        if all_intervals:
+            intervals.append(contig["name"])
+    return contigs_map, intervals
+
+
+interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)")
+
+
+# get tiledb offsets for interval
+def parse_interval(interval: str):
+    result = re.match(interval_pattern, interval)
+    if result:
+        length = len(result.groups())
+        if length == 6:
+            if result.group(1) and result.group(2) and result.group(3):
+                return result.group(1), int(result.group(2)), int(result.group(3))
+            elif result.group(4) and result.group(5):
+                return result.group(4), int(result.group(5)), None
+            elif result.group(6):
+                return result.group(6), 1, None
+    raise RuntimeError(f"Interval {interval} could not be parsed")
+
+
+def get_arrays(contigs_map, intervals, partitions):
+    arrays = set()
+    for interval in intervals:
+        contig, start, end = parse_interval(interval)
+        if contig in contigs_map:
+            contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
+            length = contigs_map[contig]["length"]
+            if end and end < length + 1:
+                contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1
+            else:
+                end = length
+                contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1
+        else:
+            print(f"Contig({contig}) not found in vidmap.json")
+            continue
+
+        for partition in partitions:
+            if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]:
+                continue
+            arrays.add(partition["array_name"])
+
+    return arrays
diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query
index 462138d..8bd57ac 100755
--- a/examples/genomicsdb_query
+++ b/examples/genomicsdb_query
@@ -32,6 +32,7 @@ import os
 import re
 import sys
 
+import genomicsdb_common
 import pyarrow as pa
 import pyarrow.parquet as pq
 
@@ -41,27 +42,6 @@ from genomicsdb.protobuf import genomicsdb_coordinates_pb2 as query_coords
 from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb
 
 
-def parse_vidmap_json(vidmap_file, intervals=None):
-    if isinstance(intervals, str):
-        is_file = True
-    else:
-        is_file = False
-    contigs_map = {}
-    vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
-    contigs = vidmap["contigs"]
-    if intervals and is_file:
-        with open(intervals) as file:
-            intervals = [line.rstrip() for line in file]
-    all_intervals = not intervals
-    if not intervals:
-        intervals = []
-    for contig in contigs:
-        contigs_map[contig["name"]] = contig
-        if all_intervals:
-            intervals.append(contig["name"])
-    return contigs_map, intervals
-
-
 def parse_callset_json(callset_file):
     callset = json.loads(genomicsdb.read_entire_file(callset_file))
     callsets = callset["callsets"]
@@ -118,13 +98,6 @@ def genomicsdb_connect(workspace, callset_file, vidmap_file, filter):
     return genomicsdb.connect_with_protobuf(export_config)
 
 
-def normalize_path(path):
-    if "://" in path:
-        return path
-    else:
-        return os.path.abspath(path)
-
-
 def setup_gdb():
     parser = argparse.ArgumentParser(
         prog="query",
@@ -230,7 +203,7 @@ def setup_gdb():
 
     args = parser.parse_args()
 
-    workspace = normalize_path(args.workspace)
+    workspace = genomicsdb_common.normalize_path(args.workspace)
     if not genomicsdb.is_file(workspace + "/__tiledb_workspace.tdb"):
         raise RuntimeError(f"workspace({workspace}) not found")
     callset_file = args.callset
@@ -250,7 +223,7 @@ def setup_gdb():
         raise RuntimeError(f"callset({callset_file}) vidmap({vidmap_file}) or loader({loader_file}) not found")
 
     if args.list_contigs:
-        _, intervals = parse_vidmap_json(vidmap_file)
+        _, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file)
         print(*intervals, sep="\n")
         sys.exit(0)
 
@@ -274,7 +247,7 @@ def setup_gdb():
             "Specify at least one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified"  # noqa
         )
 
-    contigs_map, intervals = parse_vidmap_json(vidmap_file, intervals or interval_list)
+    contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, intervals or interval_list)
     row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list)
 
     # parse loader.json for partitions
@@ -286,23 +259,6 @@ def setup_gdb():
     return gdb, workspace, partitions, contigs_map, intervals, row_tuples, args
 
 
-interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)")
-
-
-def parse_interval(interval: str):
-    result = re.match(interval_pattern, interval)
-    if result:
-        length = len(result.groups())
-        if length == 6:
-            if result.group(1) and result.group(2) and result.group(3):
-                return result.group(1), int(result.group(2)), int(result.group(3))
-            elif result.group(4) and result.group(5):
-                return result.group(4), int(result.group(5)), None
-            elif result.group(6):
-                return result.group(6), 1, None
-    raise RuntimeError(f"Interval {interval} could not be parsed")
-
-
 def generate_output_filename(output, output_type, interval, idx):
     if output_type == "arrow":
         output_filename = os.path.join(output, f"{interval.replace(':', '-')}")
@@ -356,7 +312,7 @@ def main():
     for interval in intervals:
         print(f"Processing interval({interval})...")
         # get tiledb offsets for interval
-        contig, start, end = parse_interval(interval)
+        contig, start, end = genomicsdb_common.parse_interval(interval)
         if contig in contigs_map:
             contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
             length = contigs_map[contig]["length"]
diff --git a/examples/run.sh b/examples/run.sh
index 176a5c3..125e04a 100755
--- a/examples/run.sh
+++ b/examples/run.sh
@@ -57,13 +57,15 @@ NTHREADS=${NTHREADS:=-8}
 ###########################################
 
 if [[ ! -d env ]]; then
-  python3.11 -m venv env
+  python3 -m venv env
   source env/bin/activate
   pip install genomicsdb
 else
   source env/bin/activate
 fi
 
+PATH=$(dirname $0):$PATH
+
 if [[ ! -z ${SAMPLES} ]]; then
   for SAMPLE in "${SAMPLES[@]}"
   do
@@ -78,7 +80,12 @@ fi
 
 echo $LOADER_FILE $CALLSET_FILE $VIDMAP_FILE
 rm -f loader.json callset.json vidmap.json
-./genomicsdb_cache -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE
+for INTERVAL in "${INTERVALS[@]}"
+do
+  INTERVAL_LIST="$INTERVAL_LIST -i $INTERVAL"
+done
+
+genomicsdb_cache -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE $INTERVAL_LIST
 
 if [[ -f loader.json ]]; then
     export LOADER_FILE="loader.json"
@@ -93,8 +100,8 @@ fi
 run_query() {
     INTERVAL=$1
     OUTPUT_FILE=$2
-    echo ./genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE
-    /usr/bin/time -l ./genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
+    echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE
+    /usr/bin/time -l genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
 }
 export -f run_query
 
diff --git a/examples/test.sh b/examples/test.sh
index 21b6bde..e10e185 100755
--- a/examples/test.sh
+++ b/examples/test.sh
@@ -46,7 +46,6 @@ VIDMAP_FILE=vidmap_file.json
 LOADER_FILE=loader_file.json
 FILTER='ISHOMREF'
 
-
 if [[ $(uname) == "Darwin" ]]; then
     TEMP_DIR=$(mktemp -d -t test-examples)
 else
@@ -94,7 +93,7 @@ run_command() {
     fi
 }
 
-PATH=.:$PATH
+PATH=$(dirname $0):$PATH
 run_command "genomicsdb_query" 2
 run_command "genomicsdb_query -h"
 run_command "genomicsdb_query --version"
@@ -103,7 +102,7 @@ run_command "genomicsdb_query --list-contigs" 2
 run_command "genomicsdb_query -w $WORKSPACE" 1
 run_command "genomicsdb_query -w $WORKSPACE -i xx -S XX" 1
 
-if [[ ! -d $WORKSPACE ]]; then
+if [[ -z $WORKSPACE ]]; then
     echo "Specify an existing workspace in env WORKSPACE"
     exit 1
 fi
@@ -146,4 +145,15 @@ run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_D
 run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list"
 run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER"
 
+run_command "genomicsdb_cache -w $WORKSPACE $INTERVAL_ARGS"
+export TILEDB_CACHE=1
+if [[ $WORKSPACE == *://* ]]; then
+    if [[ ! -f loader.json ]] || [[ ! -f callset.json ]] || [[ ! -f vidmap.json ]]; then
+        die "Could not cache workspace metadata for cloud URL=$WORKSPACE"
+    fi
+    echo "Running from cached metadata for workspace=$WORKSPACE..."
+    run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json"
+    echo "Running from cached metadata for workspace=$WORKSPACE DONE"
+fi
+
 cleanup
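With `parse_interval` now shared through `genomicsdb_common`, both tools accept the same interval grammar. A small sanity check of the three shapes it matches; the expected tuples follow directly from the `interval_pattern` regex above (the assertions themselves are illustrative, not part of the test suite):

```python
import genomicsdb_common

# CONTIG:START-END -> fully specified interval
assert genomicsdb_common.parse_interval("chr1:1-10000") == ("chr1", 1, 10000)
# CONTIG:START -> open-ended interval; the end is resolved later from contig length
assert genomicsdb_common.parse_interval("chr3:1000") == ("chr3", 1000, None)
# CONTIG alone -> whole contig, starting at position 1
assert genomicsdb_common.parse_interval("chr2") == ("chr2", 1, None)
```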
+ run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json" + echo "Running from cached metadata for workspace=$WORKSPACE DONE" +fi + cleanup diff --git a/src/genomicsdb.pxd b/src/genomicsdb.pxd index 9609138..5f1bd85 100644 --- a/src/genomicsdb.pxd +++ b/src/genomicsdb.pxd @@ -252,5 +252,7 @@ cdef extern from "genomicsdb_utils.h": cdef bint c_is_file "genomicsdb::is_file"(string) cdef ssize_t c_file_size "genomicsdb::file_size"(string) cdef int c_read_entire_file "genomicsdb::read_entire_file"(string, void**, size_t*) + cdef vector[string] c_get_array_names "genomicsdb::get_array_names"(string) + cdef int c_cache_fragment_metadata "genomicsdb::cache_fragment_metadata"(string, string) pass diff --git a/src/genomicsdb.pyx b/src/genomicsdb.pyx index b3ede64..e55eb05 100644 --- a/src/genomicsdb.pyx +++ b/src/genomicsdb.pyx @@ -495,3 +495,8 @@ def read_entire_file(filename): contents_string = contents.decode("utf-8") free(contents) return contents_string + + +def cache_array_metadata(workspace, array): + if c_cache_fragment_metadata(as_string(workspace), as_string(array)) != 0: + print(f"Could not cache fragment metadata for array={array} in {workspace}") diff --git a/src/genomicsdb_processor.h b/src/genomicsdb_processor.h index ff51232..0a57408 100644 --- a/src/genomicsdb_processor.h +++ b/src/genomicsdb_processor.h @@ -81,10 +81,6 @@ class VariantCallProcessor : public GenomicsDBVariantCallProcessor { PyObject* _intervals_list = NULL; }; -#define STRING_FIELD(NAME, TYPE) (TYPE.is_string() || TYPE.is_char() || TYPE.num_elements > 1 || (NAME.compare("GT") == 0)) -#define INT_FIELD(TYPE) (TYPE.is_int()) -#define FLOAT_FIELD(TYPE) (TYPE.is_float()) - class ColumnarVariantCallProcessor : public GenomicsDBVariantCallProcessor { public: ColumnarVariantCallProcessor() {