Merged
64 changes: 64 additions & 0 deletions .github/scripts/setup_azurite.sh
@@ -0,0 +1,64 @@
#!/bin/bash

# The MIT License (MIT)
# Copyright (c) 2024 dātma, inc™
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of
# this software and associated documentation files (the "Software"), to deal in
# the Software without restriction, including without limitation the rights to
# use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
# the Software, and to permit persons to whom the Software is furnished to do so,
# subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
# FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
# COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
# IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
#

set -e

# Install Dependencies
sudo apt-get update
sudo apt-get -y install azure-cli

# Install Azurite
sudo npm install -g azurite
which azurite
echo "azurite version = $(azurite --version)"

AZURITE_DIR=$GITHUB_WORKSPACE/azurite
mkdir -p $AZURITE_DIR
pushd $AZURITE_DIR

# Generate certificates
openssl req -newkey rsa:2048 -x509 -nodes -keyout key.pem -new -out cert.pem -sha256 -days 365 -addext "subjectAltName=IP:127.0.0.1" -subj "/C=CO/ST=ST/L=LO/O=OR/OU=OU/CN=CN"
sudo cp $AZURITE_DIR/cert.pem /usr/local/share/ca-certificates/ca-certificates.crt
sudo update-ca-certificates

# Start azurite
azurite --silent --skipApiVersionCheck --loose --location $AZURITE_DIR --cert cert.pem --key key.pem &> /dev/null &

# Env to run tests
export AZURE_STORAGE_ACCOUNT=devstoreaccount1
export AZURE_STORAGE_KEY=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==
export AZURE_STORAGE_SERVICE_ENDPOINT="https://127.0.0.1:10000/devstoreaccount1"
export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt

# Create container called test as TileDB expects the container to be already created
AZURE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=https://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=https://127.0.0.1:10001/devstoreaccount1;"
az storage container create -n test --connection-string $AZURE_CONNECTION_STRING

# Setup examples workspace on azurite
cd $GITHUB_WORKSPACE/examples
tar xzvf examples_ws.tgz
echo "Azure Storage Blob upload-batch..."
az storage blob upload-batch -d test/ws -s ws --connection-string $AZURE_CONNECTION_STRING
echo "Azure Storage Blob upload-batch DONE"

popd
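As a sanity check, the environment this script exports can be exercised from Python before the tests run. The snippet below is an illustrative sketch, not part of this PR; it assumes the azure-storage-blob package is installed and that the variables exported above are present in the environment:

```python
import os

from azure.storage.blob import BlobServiceClient  # pip install azure-storage-blob

# Values exported by setup_azurite.sh; the account key is Azurite's
# well-known development key, not a real secret.
account_key = os.environ["AZURE_STORAGE_KEY"]
endpoint = os.environ["AZURE_STORAGE_SERVICE_ENDPOINT"]

# Point SSL verification at the CA bundle refreshed above so the
# self-signed Azurite certificate is trusted.
client = BlobServiceClient(
    account_url=endpoint,
    credential=account_key,
    connection_verify=os.environ["REQUESTS_CA_BUNDLE"],
)

# The "test" container was created with the az CLI above; listing blobs
# under ws/ confirms the upload-batch step populated the workspace.
for blob in client.get_container_client("test").list_blobs(name_starts_with="ws/"):
    print(blob.name)
    break
```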
12 changes: 12 additions & 0 deletions .github/workflows/basic.yml
@@ -96,3 +96,15 @@ jobs:
        run: |
          PYTHONPATH=. pytest
          PYTHONPATH=. python test/test.py
          tar xzf examples/examples_ws.tgz
          PYTHONPATH=. WORKSPACE=ws ./examples/test.sh

      - name: Cloud Test - Azurite
        run: |
          source .github/scripts/setup_azurite.sh
          echo "Testing on Azurite..."
          PYTHONPATH=. WORKSPACE=az://test/ws ./examples/test.sh
          ls -l /tmp/tiledb_bookkeeping
          echo "Testing on Azurite DONE"


Binary file added examples/examples_ws.tgz
Binary file not shown.
34 changes: 25 additions & 9 deletions examples/genomicsdb_cache
Expand Up @@ -27,16 +27,11 @@
#

import argparse
import os

import genomicsdb
import json

import genomicsdb_common

def normalize_path(path):
    if "://" in path:
        return path
    else:
        return os.path.abspath(path)
import genomicsdb


def is_cloud_path(path):
@@ -81,10 +76,23 @@ def main():
    parser.add_argument(
        "-l", "--loader", required=False, help="Optional - URL to loader file. Defaults to loader.json in workspace"
    )
    parser.add_argument(
        "-i",
        "--interval",
        action="append",
        required=False,
        help="genomic intervals over which to operate. The intervals should be specified in the <CONTIG>:<START>-<END> format, with START and END optional.\nThis argument may be specified 0 or more times, e.g. -i chr1:1-10000 -i chr2 -i chr3:1000. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. samples and/or intervals must be specified using the -i/-I/-s/-S options",  # noqa
    )
    parser.add_argument(
        "-I",
        "--interval-list",
        required=False,
        help="genomic intervals listed in a file over which to operate.\nThe intervals should be specified in the <CONTIG>:<START>-<END> format, with START and END optional, one interval per line. \nNote: \n\t1. -i/--interval and -I/--interval-list are mutually exclusive \n\t2. samples and/or intervals must be specified using the -i/-I/-s/-S options",  # noqa
    )

    args = parser.parse_args()

workspace = normalize_path(args.workspace)
workspace = genomicsdb_common.normalize_path(args.workspace)
if not genomicsdb.is_file(workspace + "/__tiledb_workspace.tdb"):
raise RuntimeError(f"workspace({workspace}) not found")

@@ -113,6 +121,14 @@ def main():
    if is_cloud_path(loader_file):
        with open("loader.json", "wb") as f:
            f.write(genomicsdb.read_entire_file(loader_file).encode())
    if is_cloud_path(workspace) and (args.interval or args.interval_list):
        contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, args.interval or args.interval_list)
        loader = json.loads(genomicsdb.read_entire_file("loader.json"))
        partitions = loader["column_partitions"]

        for array in genomicsdb_common.get_arrays(contigs_map, intervals, partitions):
            print(f"Caching fragments for array {array}")
            genomicsdb.cache_array_metadata(workspace, array)


if __name__ == "__main__":
101 changes: 101 additions & 0 deletions examples/genomicsdb_common.py
@@ -0,0 +1,101 @@
#
# genomicsdb_common python script
#
# The MIT License
#
# Copyright (c) 2024 dātma, inc™
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#

import json
import os
import re

import genomicsdb


def normalize_path(path):
    if "://" in path:
        return path
    else:
        return os.path.abspath(path)


def parse_vidmap_json(vidmap_file, intervals=None):
    if isinstance(intervals, str):
        is_file = True
    else:
        is_file = False
    contigs_map = {}
    vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
    contigs = vidmap["contigs"]
    if intervals and is_file:
        with open(intervals) as file:
            intervals = [line.rstrip() for line in file]
    all_intervals = not intervals
    if not intervals:
        intervals = []
    for contig in contigs:
        contigs_map[contig["name"]] = contig
        if all_intervals:
            intervals.append(contig["name"])
    return contigs_map, intervals


interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)")


# parse an interval of the form <CONTIG>:<START>-<END>, with START and END optional
def parse_interval(interval: str):
Member:
I dunno if this is better or worse than regex -- but maybe we could consider using string split (plus a check for existence of the delimiter). Might be simpler.

Member Author:
This is done just while parsing intervals for the cli. I don't expect much time spent on this function. Guess using split is simpler.
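To make the suggestion concrete, a split-based variant might look like the sketch below. It is hypothetical (the merged PR keeps the regex) and mirrors the contract of parse_interval: START defaults to 1, END to None, and unparsable input raises RuntimeError.

```python
def parse_interval_split(interval: str):
    # Hypothetical split-based alternative; not part of this PR.
    try:
        if ":" not in interval:
            return interval, 1, None  # bare contig, e.g. "chr2"
        # Split from the right so any extra ":" stays in the contig name,
        # matching the greedy behavior of the regex.
        contig, positions = interval.rsplit(":", 1)
        if "-" in positions:
            start, end = positions.split("-", 1)
            return contig, int(start), int(end)  # e.g. "chr1:1-10000"
        return contig, int(positions), None  # e.g. "chr3:1000"
    except ValueError:
        raise RuntimeError(f"Interval {interval} could not be parsed")
```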

    result = re.match(interval_pattern, interval)
    if result:
        length = len(result.groups())
        if length == 6:
            if result.group(1) and result.group(2) and result.group(3):
                return result.group(1), int(result.group(2)), int(result.group(3))
            elif result.group(4) and result.group(5):
                return result.group(4), int(result.group(5)), None
            elif result.group(6):
                return result.group(6), 1, None
    raise RuntimeError(f"Interval {interval} could not be parsed")


def get_arrays(contigs_map, intervals, partitions):
    arrays = set()
    for interval in intervals:
        contig, start, end = parse_interval(interval)
        if contig in contigs_map:
            contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
            length = contigs_map[contig]["length"]
            if end and end < length + 1:
                contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1
            else:
                end = length
                contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1
        else:
            print(f"Contig({contig}) not found in vidmap.json")
            continue

        for partition in partitions:
            if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]:
                continue
            arrays.add(partition["array_name"])

    return arrays
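To make the column arithmetic concrete: a query interval is mapped to absolute TileDB columns by adding the contig's tiledb_column_offset, and a partition is selected whenever its [begin, end] column range overlaps the interval's range. A small walk-through with made-up vidmap/loader fragments (illustration only, not real offsets or array names):

```python
# Hypothetical vidmap/loader fragments -- for illustration only.
contigs_map = {
    "chr1": {"name": "chr1", "tiledb_column_offset": 0, "length": 248956422},
    "chr2": {"name": "chr2", "tiledb_column_offset": 248956422, "length": 242193529},
}
partitions = [
    {"array_name": "part0", "begin": {"tiledb_column": 0}, "end": {"tiledb_column": 199999999}},
    {"array_name": "part1", "begin": {"tiledb_column": 200000000}, "end": {"tiledb_column": 399999999}},
    {"array_name": "part2", "begin": {"tiledb_column": 400000000}, "end": {"tiledb_column": 599999999}},
]

# chr1:1-10000 maps to columns [0, 9999] and touches only part0;
# bare chr2 maps to columns [248956422, 491149950] and touches part1 and part2.
print(get_arrays(contigs_map, ["chr1:1-10000", "chr2"], partitions))
# -> {'part0', 'part1', 'part2'} (a set, so ordering may vary)
```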
54 changes: 5 additions & 49 deletions examples/genomicsdb_query
@@ -32,6 +32,7 @@ import os
import re
import sys

import genomicsdb_common
import pyarrow as pa
import pyarrow.parquet as pq

@@ -41,27 +42,6 @@ from genomicsdb.protobuf import genomicsdb_coordinates_pb2 as query_coords
from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb


def parse_vidmap_json(vidmap_file, intervals=None):
    if isinstance(intervals, str):
        is_file = True
    else:
        is_file = False
    contigs_map = {}
    vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file))
    contigs = vidmap["contigs"]
    if intervals and is_file:
        with open(intervals) as file:
            intervals = [line.rstrip() for line in file]
    all_intervals = not intervals
    if not intervals:
        intervals = []
    for contig in contigs:
        contigs_map[contig["name"]] = contig
        if all_intervals:
            intervals.append(contig["name"])
    return contigs_map, intervals


def parse_callset_json(callset_file):
    callset = json.loads(genomicsdb.read_entire_file(callset_file))
    callsets = callset["callsets"]
@@ -118,13 +98,6 @@ def genomicsdb_connect(workspace, callset_file, vidmap_file, filter):
    return genomicsdb.connect_with_protobuf(export_config)


def normalize_path(path):
    if "://" in path:
        return path
    else:
        return os.path.abspath(path)


def setup_gdb():
    parser = argparse.ArgumentParser(
        prog="query",
@@ -230,7 +203,7 @@ def setup_gdb():

args = parser.parse_args()

workspace = normalize_path(args.workspace)
    workspace = genomicsdb_common.normalize_path(args.workspace)
    if not genomicsdb.is_file(workspace + "/__tiledb_workspace.tdb"):
        raise RuntimeError(f"workspace({workspace}) not found")
    callset_file = args.callset
@@ -250,7 +223,7 @@ def setup_gdb():
raise RuntimeError(f"callset({callset_file}) vidmap({vidmap_file}) or loader({loader_file}) not found")

    if args.list_contigs:
        _, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file)
        print(*intervals, sep="\n")
        sys.exit(0)
sys.exit(0)

Expand All @@ -274,7 +247,7 @@ def setup_gdb():
"Specify at least one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified" # noqa
)

contigs_map, intervals = parse_vidmap_json(vidmap_file, intervals or interval_list)
contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, intervals or interval_list)
row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list)

# parse loader.json for partitions
Expand All @@ -286,23 +259,6 @@ def setup_gdb():
    return gdb, workspace, partitions, contigs_map, intervals, row_tuples, args


interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)")


def parse_interval(interval: str):
    result = re.match(interval_pattern, interval)
    if result:
        length = len(result.groups())
        if length == 6:
            if result.group(1) and result.group(2) and result.group(3):
                return result.group(1), int(result.group(2)), int(result.group(3))
            elif result.group(4) and result.group(5):
                return result.group(4), int(result.group(5)), None
            elif result.group(6):
                return result.group(6), 1, None
    raise RuntimeError(f"Interval {interval} could not be parsed")


def generate_output_filename(output, output_type, interval, idx):
    if output_type == "arrow":
        output_filename = os.path.join(output, f"{interval.replace(':', '-')}")
@@ -356,7 +312,7 @@ def main():
    for interval in intervals:
        print(f"Processing interval({interval})...")
        # get tiledb offsets for interval
        contig, start, end = genomicsdb_common.parse_interval(interval)
        if contig in contigs_map:
            contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
            length = contigs_map[contig]["length"]
length = contigs_map[contig]["length"]
Expand Down
15 changes: 11 additions & 4 deletions examples/run.sh
@@ -57,13 +57,15 @@ NTHREADS=${NTHREADS:=-8}
###########################################

if [[ ! -d env ]]; then
python3.11 -m venv env
python3 -m venv env
source env/bin/activate
pip install genomicsdb
else
source env/bin/activate
fi

PATH=$(dirname $0):$PATH

if [[ ! -z ${SAMPLES} ]]; then
for SAMPLE in "${SAMPLES[@]}"
do
@@ -78,7 +80,12 @@ fi
echo $LOADER_FILE $CALLSET_FILE $VIDMAP_FILE

rm -f loader.json callset.json vidmap.json
./genomicsdb_cache -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE
for INTERVAL in "${INTERVALS[@]}"
do
    INTERVAL_LIST="$INTERVAL_LIST -i $INTERVAL"
done

genomicsdb_cache -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE $INTERVAL_LIST

if [[ -f loader.json ]]; then
export LOADER_FILE="loader.json"
@@ -93,8 +100,8 @@ fi
run_query() {
    INTERVAL=$1
    OUTPUT_FILE=$2
    echo ./genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE
    /usr/bin/time -l ./genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
    echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE
    /usr/bin/time -l genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE
}

export -f run_query