diff --git a/.github/scripts/setup_azurite.sh b/.github/scripts/setup_azurite.sh index 50998b2..1f0bcbf 100644 --- a/.github/scripts/setup_azurite.sh +++ b/.github/scripts/setup_azurite.sh @@ -54,11 +54,15 @@ export REQUESTS_CA_BUNDLE=/etc/ssl/certs/ca-certificates.crt AZURE_CONNECTION_STRING="DefaultEndpointsProtocol=https;AccountName=devstoreaccount1;AccountKey=Eby8vdM02xNOcqFlqUwJPLlmEtlCDXJ1OUzFT50uSRZ6IFsuFq2UVErCz4I6tq/K1SZFPTOtr/KBHBeksoGMGw==;BlobEndpoint=https://127.0.0.1:10000/devstoreaccount1;QueueEndpoint=https://127.0.0.1:10001/devstoreaccount1;" az storage container create -n test --connection-string $AZURE_CONNECTION_STRING -# Setup examples workspace on azurite +# Setup test workspaces on azurite +mkdir oldstyle_dir +tar xzvf $GITHUB_WORKSPACE/test/inputs/sanity.test.tgz -C oldstyle_dir +az storage blob upload-batch -d test/oldstyle-dir -s oldstyle_dir --connection-string $AZURE_CONNECTION_STRING +export OLDSTYLE_DIR=az://test/oldstyle-dir + cd $GITHUB_WORKSPACE/examples tar xzvf examples_ws.tgz -echo "Azure Storage Blob upload-batch..." az storage blob upload-batch -d test/ws -s ws --connection-string $AZURE_CONNECTION_STRING -echo "Azure Storage Blob upload-batch DONE" +export WORKSPACE=az://test/ws popd diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 31093a3..57a600b 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -102,7 +102,8 @@ jobs: run: | source .github/scripts/setup_azurite.sh echo "Testing on Azurite..." - PYTHONPATH=. WORKSPACE=az://test/ws ./examples/test.sh + echo "WORKSPACE=$WORKSPACE OLDSTYLE_DIR=$OLDSTYLE_DIR" + PYTHONPATH=. 
./examples/test.sh ls -l /tmp/tiledb_bookkeeping echo "Testing on Azurite DONE" diff --git a/examples/genomicsdb_cache b/examples/genomicsdb_cache index e8dd56f..1454f63 100755 --- a/examples/genomicsdb_cache +++ b/examples/genomicsdb_cache @@ -41,6 +41,11 @@ def is_cloud_path(path): return False +def get_arrays(interval, contigs_map, partitions): + _, _, _, arrays = genomicsdb_common.get_arrays(interval, contigs_map, partitions) + return arrays + + def main(): parser = argparse.ArgumentParser( prog="cache", @@ -125,8 +130,12 @@ def main(): contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, args.interval or args.interval_list) loader = json.loads(genomicsdb.read_entire_file("loader.json")) partitions = loader["column_partitions"] - - for array in genomicsdb_common.get_arrays(contigs_map, intervals, partitions): + arrays = { + arrays_for_interval + for interval in intervals + for arrays_for_interval in get_arrays(interval, contigs_map, partitions) + } + for array in arrays: print(f"Caching fragments for array {array}") if genomicsdb.array_exists(workspace, array): genomicsdb.cache_array_metadata(workspace, array) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index c0617d2..803e145 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -27,6 +27,7 @@ import json import os import re +import sys import genomicsdb @@ -85,25 +86,39 @@ def parse_interval(interval: str): raise RuntimeError(f"Interval {interval} could not be parsed") -def get_arrays(contigs_map, intervals, partitions): - arrays = set() - for interval in intervals: - contig, start, end = parse_interval(interval) - if contig in contigs_map: - contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1 - length = contigs_map[contig]["length"] - if end and end < length + 1: - contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1 - else: - end = length - contig_end = contigs_map[contig]["tiledb_column_offset"] 
+ length - 1 +def get_arrays(interval, contigs_map, partitions): + contig, start, end = parse_interval(interval) + if contig in contigs_map: + contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1 + length = contigs_map[contig]["length"] + if end and end < length + 1: + contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1 else: - print(f"Contig({contig}) not found in vidmap.json") + end = length + contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1 + else: + print(f"Contig({contig}) not found in vidmap.json"); return contig, start, end, [] + + arrays = [] + for idx, partition in enumerate(partitions): + if isinstance(partition["begin"], int): # Old style vidmap json + column_begin = partition["begin"] + if "end" in partition.keys(): + column_end = partition["end"] + elif idx + 1 < len(partitions): + column_end = partitions[idx + 1]["begin"] - 1 + else: + column_end = sys.maxsize + else: # Generated with vcf2genomicsdb_init + column_begin = partition["begin"]["tiledb_column"] + column_end = partition["end"]["tiledb_column"] + + if contig_end < column_begin or contig_offset > column_end: continue - for partition in partitions: - if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]: - continue - arrays.add(partition["array_name"]) + if "array_name" in partition.keys(): + arrays.append(partition["array_name"]) + elif "array" in partition.keys(): + arrays.append(partition["array"]) - return arrays + return contig, start, end, arrays diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 62c89b4..d86644c 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -307,7 +307,7 @@ def parse_args_for_max_bytes(max_arrow_byte_size): def main(): gdb, workspace, partitions, contigs_map, intervals, row_tuples, args = setup_gdb() - if row_tuples and len(row_tuples) == 0: + if row_tuples is not None and len(row_tuples) == 0: return print(f"Starting genomicsdb_query for 
workspace({workspace}) and intervals({intervals})") @@ -321,55 +321,23 @@ def main(): range.high = tuple[1] row_range_list.range_list.extend([range]) + output_type = args.output_type + output = args.output + if output_type == "json": + json_type = parse_args_for_json_type(args.json_output_type) + if output_type == "arrow": + max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size) + print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files") + for interval in intervals: print(f"Processing interval({interval})...") - # get tiledb offsets for interval - contig, start, end = genomicsdb_common.parse_interval(interval) - if contig in contigs_map: - contig_offset = contigs_map[contig]["tiledb_column_offset"] + start - 1 - length = contigs_map[contig]["length"] - if end and end < length + 1: - contig_end = contigs_map[contig]["tiledb_column_offset"] + end - 1 - else: - end = length - contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1 - else: - print(f"Contig({contig}) not found in vidmap.json") - continue - arrays = [] - for idx, partition in enumerate(partitions): - if isinstance(partition["begin"], int): # Old style vidmap json - column_begin = partition["begin"] - if "end" in partition.keys(): - column_end = partition["end"] - elif idx + 1 < len(partitions): - column_end = partitions[idx + 1]["begin"] - 1 - else: - column_end = sys.maxsize - else: # Generated with vcf2genomicsdb_init - column_begin = partition["begin"]["tiledb_column"] - column_end = partition["end"]["tiledb_column"] - if contig_end < column_begin or contig_offset > column_end: - continue - if "array_name" in partition.keys(): - arrays.append(partition["array_name"]) - elif "array" in partition.keys(): - arrays.append(partition["array"]) - - arrays_length = len(arrays) - if arrays_length == 0: + contig, start, end, arrays = genomicsdb_common.get_arrays(interval, contigs_map, partitions) + if len(arrays) == 0: print(f"No arrays in the 
workspace matched input interval({interval})") continue - print(f"\tArrays:{arrays} under consideration for interval({interval})") - output_type = args.output_type - output = args.output - if output_type == "json": - json_type = parse_args_for_json_type(args.json_output_type) - if output_type == "arrow": - max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size) - print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files") + print(f"\tArrays:{arrays} under consideration for interval({interval})") for idx, array in enumerate(arrays): if not genomicsdb.array_exists(workspace, array): diff --git a/examples/run.sh b/examples/run.sh index 0689575..da38252 100755 --- a/examples/run.sh +++ b/examples/run.sh @@ -45,8 +45,10 @@ INTERVALS=("1:1-1000000") #declare -a SAMPLES #SAMPLES=("HG00096" "HG00097" "HG00099") +#SAMPLES_LIST=samples.list #FILTER='resolve(GT, REF, ALT) &= "T/T"' +FILTER='!ISHOMREF' export OUTPUT_FILE=${OUTPUT_FILE:-my_output} export OUTPUT_FILE_TYPE=${OUTPUT_FILE_TYPE:-json} @@ -54,16 +56,18 @@ export OUTPUT_FILE_TYPE=${OUTPUT_FILE_TYPE:-json} export TILEDB_CACHE=1 NTHREADS=${NTHREADS:-8} +VENV=${VENV:-env} + ########################################### # Should not have to change anything below ########################################### -if [[ ! -d env ]]; then - python3 -m venv env +if [[ ! -d $VENV ]]; then + python3 -m venv $VENV - source env/bin/activate + source $VENV/bin/activate pip install genomicsdb else - source env/bin/activate + source $VENV/bin/activate fi PATH=$(dirname $0):$PATH @@ -75,8 +79,12 @@ if [[ ! -z ${SAMPLES} ]]; then done fi +if [[ ! -z $SAMPLES_LIST ]]; then + export SAMPLE_ARGS="-S $SAMPLES_LIST" +fi + if [[ ! 
-z ${FILTER} ]]; then - FILTER_EXPR="-f $FILTER" + export FILTER_EXPR="-f $FILTER" fi echo $LOADER_FILE $CALLSET_FILE $VIDMAP_FILE diff --git a/examples/test.sh b/examples/test.sh index 85b2667..a65a2f6 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -158,20 +158,17 @@ if [[ $WORKSPACE == *://* ]]; then fi rm -f loader.json callset.json vidmap.json -if [[ $WORKSPACE == *://* ]]; then - cleanup - exit 0 -fi - #################################################################### # # Check old style workspaces with genomicsdb_query/genomicsdb_cache # #################################################################### -OLDSTYLE_DIR=$TEMP_DIR/old_style -mkdir -p $OLDSTYLE_DIR -tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR +if [[ -z $OLDSTYLE_DIR ]]; then + OLDSTYLE_DIR=$TEMP_DIR/old_style + mkdir -p $OLDSTYLE_DIR + tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR +fi WORKSPACE=$OLDSTYLE_DIR/ws run_command "genomicsdb_query -w $WORKSPACE --list-samples" @@ -181,6 +178,7 @@ run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json" +run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" diff --git a/test/inputs/sanity.test.tgz b/test/inputs/sanity.test.tgz index 68ed4df..e999175 100644 Binary files a/test/inputs/sanity.test.tgz and b/test/inputs/sanity.test.tgz differ