Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions .github/scripts/setup_azurite.sh
Original file line number Diff line number Diff line change
Expand Up @@ -54,11 +54,15 @@ export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt
AZURE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=https://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=https://127.0.0.1:10001/devstoreaccount1;"
az storage container create -n test --connection-string $AZURE_CONNECTION_STRING

# Setup examples workspace on azurite
# Setup test workspaces on azurite
mkdir oldstyle_dir
tar xzvf $GITHUB_WORKSPACE/test/inputs/sanity.test.tgz -C oldstyle_dir
az storage blob upload-batch -d test/oldstyle-dir -s oldstyle_dir --connection-string $AZURE_CONNECTION_STRING
export OLDSTYLE_DIR=az://test/oldstyle-dir

cd $GITHUB_WORKSPACE/examples
tar xzvf examples_ws.tgz
echo "Azure Storage Blob upload-batch..."
az storage blob upload-batch -d test/ws -s ws --connection-string $AZURE_CONNECTION_STRING
echo "Azure Storage Blob upload-batch DONE"
export WORKSPACE=az://test/ws

popd
3 changes: 2 additions & 1 deletion .github/workflows/basic.yml
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,8 @@ jobs:
run: |
source .github/scripts/setup_azurite.sh
echo "Testing on Azurite..."
PYTHONPATH=. WORKSPACE=az://test/ws ./examples/test.sh
echo "WORKSPACE=$WORKSPACE OLDSTYLE_DIR=$OLDSTYLE_DIR"
PYTHONPATH=. ./examples/test.sh
ls -l /tmp/tiledb_bookkeeping
echo "Testing on Azurite DONE"

Expand Down
13 changes: 11 additions & 2 deletions examples/genomicsdb_cache
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,11 @@ def is_cloud_path(path):
return False


def get_arrays(interval, contigs_map, partitions):
    """Return only the array names overlapping *interval*, dropping the
    contig/start/end fields that genomicsdb_common.get_arrays also returns."""
    return genomicsdb_common.get_arrays(interval, contigs_map, partitions)[3]


def main():
parser = argparse.ArgumentParser(
prog="cache",
Expand Down Expand Up @@ -125,8 +130,12 @@ def main():
contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, args.interval or args.interval_list)
loader = json.loads(genomicsdb.read_entire_file("loader.json"))
partitions = loader["column_partitions"]

for array in genomicsdb_common.get_arrays(contigs_map, intervals, partitions):
arrays = {
arrays_for_interval
for interval in intervals
for arrays_for_interval in get_arrays(interval, contigs_map, partitions)
}
for array in arrays:
print(f"Caching fragments for array {array}")
if genomicsdb.array_exists(workspace, array):
genomicsdb.cache_array_metadata(workspace, array)
Expand Down
51 changes: 33 additions & 18 deletions examples/genomicsdb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
import json
import os
import re
import sys

import genomicsdb

Expand Down Expand Up @@ -85,25 +86,39 @@ def parse_interval(interval: str):
raise RuntimeError(f"Interval {interval} could not be parsed")


def get_arrays(contigs_map, intervals, partitions):
arrays = set()
for interval in intervals:
contig, start, end = parse_interval(interval)
if contig in contigs_map:
contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
length = contigs_map[contig]["length"]
if end and end < length + 1:
contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1
else:
end = length
contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1
def get_arrays(interval, contigs_map, partitions):
    """Map a genomic interval to the GenomicsDB arrays whose column ranges overlap it.

    Args:
        interval: interval string accepted by parse_interval (e.g. "1:100-200").
        contigs_map: mapping of contig name -> dict with at least
            "tiledb_column_offset" and "length" entries (from vidmap.json).
        partitions: "column_partitions" list from loader.json. Both layouts are
            supported: old-style entries with integer "begin" (and optional "end"),
            and vcf2genomicsdb_init-style entries where "begin"/"end" are dicts
            carrying a "tiledb_column" value.

    Returns:
        Tuple (contig, start, end, arrays). arrays is a list of array names in
        partition order; it is empty when the contig is unknown or no partition
        overlaps the interval.
    """
    contig, start, end = parse_interval(interval)
    if contig not in contigs_map:
        # Without a vidmap entry we cannot compute tiledb column offsets.
        # Return an empty result instead of falling through to an unbound
        # contig_offset/contig_end below (NameError).
        print(f"Contig({contig}) not found in vidmap.json")
        return contig, start, end, []

    contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
    length = contigs_map[contig]["length"]
    if end and end < length + 1:
        contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1
    else:
        # Open-ended or past-the-end interval: clamp to the contig length.
        end = length
        contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1

    arrays = []
    for idx, partition in enumerate(partitions):
        if isinstance(partition["begin"], int):  # Old style loader json
            column_begin = partition["begin"]
            if "end" in partition:
                column_end = partition["end"]
            elif idx + 1 < len(partitions):
                # No explicit end: the partition runs up to the next begin.
                column_end = partitions[idx + 1]["begin"] - 1
            else:
                # Last partition is unbounded.
                column_end = sys.maxsize
        else:  # Generated with vcf2genomicsdb_init
            column_begin = partition["begin"]["tiledb_column"]
            column_end = partition["end"]["tiledb_column"]

        # Skip partitions that do not intersect [contig_offset, contig_end].
        if contig_end < column_begin or contig_offset > column_end:
            continue

        if "array_name" in partition:
            arrays.append(partition["array_name"])
        elif "array" in partition:
            arrays.append(partition["array"])

    return contig, start, end, arrays
56 changes: 12 additions & 44 deletions examples/genomicsdb_query
Original file line number Diff line number Diff line change
Expand Up @@ -307,7 +307,7 @@ def parse_args_for_max_bytes(max_arrow_byte_size):
def main():
gdb, workspace, partitions, contigs_map, intervals, row_tuples, args = setup_gdb()

if row_tuples and len(row_tuples) == 0:
if row_tuples is not None and len(row_tuples) == 0:
return

print(f"Starting genomicsdb_query for workspace({workspace}) and intervals({intervals})")
Expand All @@ -321,55 +321,23 @@ def main():
range.high = tuple[1]
row_range_list.range_list.extend([range])

output_type = args.output_type
output = args.output
if output_type == "json":
json_type = parse_args_for_json_type(args.json_output_type)
if output_type == "arrow":
max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size)
print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files")

for interval in intervals:
print(f"Processing interval({interval})...")
# get tiledb offsets for interval
contig, start, end = genomicsdb_common.parse_interval(interval)
if contig in contigs_map:
contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1
length = contigs_map[contig]["length"]
if end and end < length + 1:
contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1
else:
end = length
contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1
else:
print(f"Contig({contig}) not found in vidmap.json")
continue

arrays = []
for idx, partition in enumerate(partitions):
if isinstance(partition["begin"], int): # Old style vidmap json
column_begin = partition["begin"]
if "end" in partition.keys():
column_end = partition["end"]
elif idx + 1 < len(partitions):
column_end = partitions[idx + 1]["begin"] - 1
else:
column_end = sys.maxsize
else: # Generated with vcf2genomicsdb_init
column_begin = partition["begin"]["tiledb_column"]
column_end = partition["end"]["tiledb_column"]
if contig_end < column_begin or contig_offset > column_end:
continue
if "array_name" in partition.keys():
arrays.append(partition["array_name"])
elif "array" in partition.keys():
arrays.append(partition["array"])

arrays_length = len(arrays)
if arrays_length == 0:
contig, start, end, arrays = genomicsdb_common.get_arrays(interval, contigs_map, partitions)
if len(arrays) == 0:
print(f"No arrays in the workspace matched input interval({interval})")
continue
print(f"\tArrays:{arrays} under consideration for interval({interval})")

output_type = args.output_type
output = args.output
if output_type == "json":
json_type = parse_args_for_json_type(args.json_output_type)
if output_type == "arrow":
max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size)
print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files")
print(f"\tArrays:{arrays} under consideration for interval({interval})")

for idx, array in enumerate(arrays):
if not genomicsdb.array_exists(workspace, array):
Expand Down
16 changes: 12 additions & 4 deletions examples/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,29 @@ INTERVALS=("1:1-1000000")

#declare -a SAMPLES
#SAMPLES=("HG00096" "HG00097" "HG00099")
#SAMPLES_LIST=samples.list

#FILTER='resolve(GT, REF, ALT) &= "T/T"'
FILTER='!ISHOMREF'

export OUTPUT_FILE=${OUTPUT_FILE:-my_output}
export OUTPUT_FILE_TYPE=${OUTPUT_FILE_TYPE:-json}

export TILEDB_CACHE=1
NTHREADS=${NTHREADS:-8}

VENV=${VENV:-env}

###########################################
# Should not have to change anything below
###########################################

if [[ ! -d env ]]; then
python3 -m venv env
# Create the virtual environment on first use, then activate it either way.
# Always reference $VENV — sourcing a hard-coded env/bin/activate would break
# when the caller overrides VENV.
if [[ ! -d $VENV ]]; then
    python3 -m venv $VENV
    source $VENV/bin/activate
    pip install genomicsdb
else
    source $VENV/bin/activate
fi

PATH=$(dirname $0):$PATH
Expand All @@ -75,8 +79,12 @@ if [[ ! -z ${SAMPLES} ]]; then
done
fi

# Pass the samples file to the query tools when SAMPLES_LIST is set.
# Note: interpolate $SAMPLES_LIST (the variable tested above), not the
# undefined $SAMPLE_LIST, which would yield an empty "-S " argument.
if [[ ! -z $SAMPLES_LIST ]]; then
    export SAMPLE_ARGS="-S $SAMPLES_LIST"
fi

if [[ ! -z ${FILTER} ]]; then
FILTER_EXPR="-f $FILTER"
export FILTER_EXPR="-f $FILTER"
fi

echo $LOADER_FILE $CALLSET_FILE $VIDMAP_FILE
Expand Down
14 changes: 6 additions & 8 deletions examples/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -158,20 +158,17 @@ if [[ $WORKSPACE == *://* ]]; then
fi
rm -f loader.json callset.json vidmap.json

if [[ $WORKSPACE == *://* ]]; then
cleanup
exit 0
fi

####################################################################
#
# Check old style workspaces with genomicsdb_query/genomicsdb_cache
#
####################################################################

OLDSTYLE_DIR=$TEMP_DIR/old_style
mkdir -p $OLDSTYLE_DIR
tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR
if [[ -z $OLDSTYLE_DIR ]]; then
OLDSTYLE_DIR=$TEMP_DIR/old_style
mkdir -p $OLDSTYLE_DIR
tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR
fi
WORKSPACE=$OLDSTYLE_DIR/ws

run_command "genomicsdb_query -w $WORKSPACE --list-samples"
Expand All @@ -181,6 +178,7 @@ run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o
run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT"

OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json"
run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs"
run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions"
Expand Down
Binary file modified test/inputs/sanity.test.tgz
Binary file not shown.