From bea23e0c0fdb48f7cc85083998f6d8c58a82b9dd Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Fri, 20 Dec 2024 14:53:21 -0800 Subject: [PATCH 1/8] Support old style jsons for vidmap, callset and loader in genomicsdb_query --- examples/genomicsdb_common.py | 12 +++++-- examples/genomicsdb_query | 45 +++++++++++++++++++++----- examples/run.sh | 12 +++++-- examples/test.sh | 59 ++++++++++++++++++++++++++--------- 4 files changed, 102 insertions(+), 26 deletions(-) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 2a926f9..1d1bc66 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -53,9 +53,17 @@ def parse_vidmap_json(vidmap_file, intervals=None): if not intervals: intervals = [] for contig in contigs: - contigs_map[contig["name"]] = contig + if isinstance(contig) == str: # Old style vidmap json + contig_name = contig + contigs_map[contig] = { + "length": contigs[contig]["length"], + "tiledb_column_offset": contigs[contig]["tiledb_column_offset"], + } + else: # Generated with vcf2genomicsdb_init + contig_name = contig["name"] + contigs_map[contig["name"]] = contig if all_intervals: - intervals.append(contig["name"]) + intervals.append(contig_name) return contigs_map, intervals diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 5015c02..999c122 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -45,7 +45,12 @@ from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb def parse_callset_json(callset_file): callset = json.loads(genomicsdb.read_entire_file(callset_file)) callsets = callset["callsets"] - samples = [callset["sample_name"] for callset in callsets] + samples = [] + for callset in callsets: + if isinstance(callset) == str: # Old style vidmap json + samples.append(callset) + else: # Generated with vcf2genomicsdb_init + samples.append(callset["sample_name"]) return samples @@ -57,7 +62,16 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): if isinstance(samples, str): with open(samples) as file: samples = [line.rstrip() for line in file] - rows = [callset["row_idx"] for sample in samples for callset in callsets if sample == callset["sample_name"]] + if not samples: + return None + # Old style json has type(callset) == str whereas the one generated with vcf2genomicsdb_init is a list + rows = [ + callset["row_idx"] + for sample in samples + for callset in callsets + if (isinstance(callset) != str and sample == callset["sample_name"]) + or (isinstance(callset) == str and sample == callset) + ] if len(rows) == 0: print(f"None of the samples{samples} specified were found in the workspace") return [] @@ -78,7 +92,7 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): return row_tuples -def parse_loader_json(loader_file): +def parse_loader_json(workspace, loader_file, contigs_map): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] array_names = [partition["array_name"] for partition in partitions] @@ -140,7 +154,7 @@ def setup_gdb(): parser.add_argument( "--list-partitions", action="store_true", - help="List interval partitions for the ingested samples in the workspace and exit", + help="List interval partitions(genomicsdb arrays in the workspace) for the ingested samples in the workspace and exit", # noqa ) parser.add_argument( "-i", @@ -233,8 +247,9 @@ def setup_gdb(): sys.exit(0) if args.list_partitions: + contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file) # parse loader.json for partitions - partitions = parse_loader_json(loader_file) + partitions = parse_loader_json(workspace, loader_file, contigs_map) print(*partitions, sep="\n") sys.exit(0) @@ -326,10 +341,24 @@ def main(): continue arrays = [] - for partition in partitions: - if contig_end < partition["begin"]["tiledb_column"] or contig_offset > partition["end"]["tiledb_column"]: + for idx, partition in enumerate(partitions): + if isinstance(partition["begin"]) == int: # Old style vidmap json + column_begin = partition["begin"] + if "end" in partition.keys(): + column_end = partition["end"] + elif idx + 1 < len(partitions): + column_end = partitions[idx + 1]["begin"] - 1 + else: + column_end = sys.maxsize + else: # Generated with vcf2genomicsdb_init + column_begin = partition["begin"]["tiledb_column"] + column_end = partition["end"]["tiledb_column"] + if contig_end < column_begin or contig_offset > column_end: continue - arrays.append(partition["array_name"]) + if "array_name" in partition.keys(): + arrays.append(partition["array_name"]) + elif "array" in partition.keys(): + arrays.append(partition["array"]) arrays_length = len(arrays) if arrays_length == 0: diff --git a/examples/run.sh b/examples/run.sh index 49c4b90..0689575 100755 --- a/examples/run.sh +++ b/examples/run.sh @@ -26,7 +26,9 @@ # # -WORKSPACE=${WORKSPACE:-my_workspace} +set -e + +export WORKSPACE=${WORKSPACE:-my_workspace} export CALLSET_FILE=${CALLSET_FILE:-$WORKSPACE/callset.json} export VIDMAP_FILE=${VIDMAP_FILE:-$WORKSPACE/vidmap.json} export LOADER_FILE=${LOADER_FILE:-$WORKSPACE/loader.json} @@ -97,11 +99,17 @@ if [[ -f vidmap.json ]]; then export VIDMAP_FILE="vidmap.json" fi +if [[ $(uname) == "Darwin" ]]; then + export MEASURE_PERFORMANCE="/usr/bin/time -l" +else + export MEASURE_PERFORMANCE="/usr/bin/time -v" +fi + run_query() { INTERVAL=$1 OUTPUT_FILE=$2 echo genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE - /usr/bin/time -l genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE + $MEASURE_PERFORMANCE genomicsdb_query -w $WORKSPACE -l $LOADER_FILE -c $CALLSET_FILE -v $VIDMAP_FILE -i $INTERVAL $SAMPLE_ARGS $FILTER_EXPR -o $OUTPUT_FILE -t $OUTPUT_FILE_TYPE } export -f run_query diff --git a/examples/test.sh b/examples/test.sh index e10e185..85b2667 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -39,7 +39,6 @@ # ~/GenomicsDB/install/bin/vcf2genomicsdb -r 2 ws/loader.json # ~/GenomicsDB/install/bin/vcf2genomicsdb -r 80 ws/loader.json -WORKSPACE=${WORKSPACE:-ws} INTERVAL_ARGS="-i 1:1-40000000 -i 2:3000-40000 -i 2:40001 -i 3" SAMPLE_ARGS="-s HG00096 -s HG00097 -s HG00099" VIDMAP_FILE=vidmap_file.json @@ -93,7 +92,13 @@ run_command() { fi } +if [[ -z $WORKSPACE ]]; then + tar xzf $(dirname $0)/examples_ws.tgz -C $TEMP_DIR + WORKSPACE=$TEMP_DIR/ws +fi + PATH=$(dirname $0):$PATH + run_command "genomicsdb_query" 2 run_command "genomicsdb_query -h" run_command "genomicsdb_query --version" @@ -102,11 +107,6 @@ run_command "genomicsdb_query --list-contigs" 2 run_command "genomicsdb_query -w $WORKSPACE" 1 run_command "genomicsdb_query -w $WORKSPACE -i xx -S XX" 1 -if [[ -z $WORKSPACE ]]; then - echo "Specify an existing workspace in env WORKSPACE" - exit 1 -fi - genomicsdb_query -w $WORKSPACE --list-contigs > $TEMP_DIR/contigs.list genomicsdb_query -w $WORKSPACE --list-samples > $TEMP_DIR/samples.list @@ -137,14 +137,15 @@ do fi done -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE" -run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list" -run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list" -run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER -o $OUTPUT" +rm -f loader.json callset.json vidmap.json run_command "genomicsdb_cache -w $WORKSPACE $INTERVAL_ARGS" export TILEDB_CACHE=1 if [[ $WORKSPACE == *://* ]]; then @@ -152,8 +153,38 @@ if [[ $WORKSPACE == *://* ]]; then die "Could not cache workspace metadata for cloud URL=$WORKSPACE" fi echo "Running from cached metadata for workspace=$WORKSPACE..." - run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json" + run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -s HG00097 -l loader.json -c callset.json -v vidmap.json -o $OUTPUT" echo "Running from cached metadata for workspace=$WORKSPACE DONE" fi +rm -f loader.json callset.json vidmap.json + +if [[ $WORKSPACE == *://* ]]; then + cleanup + exit 0 +fi + +#################################################################### +# +# Check old style workspaces with genomicsdb_query/genomicsdb_cache +# +#################################################################### + +OLDSTYLE_DIR=$TEMP_DIR/old_style +mkdir -p $OLDSTYLE_DIR +tar xzf $(dirname $0)/../test/inputs/sanity.test.tgz -C $OLDSTYLE_DIR +WORKSPACE=$OLDSTYLE_DIR/ws + +run_command "genomicsdb_query -w $WORKSPACE --list-samples" +run_command "genomicsdb_query -w $WORKSPACE --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" + +OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" cleanup From a3bd0a16176165d0edb7ff70e4e18f4888e281ba Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Fri, 20 Dec 2024 15:59:54 -0800 Subject: [PATCH 2/8] Fix isinstance() --- examples/genomicsdb_common.py | 2 +- examples/genomicsdb_query | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 1d1bc66..c0617d2 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -53,7 +53,7 @@ def parse_vidmap_json(vidmap_file, intervals=None): if not intervals: intervals = [] for contig in contigs: - if isinstance(contig) == str: # Old style vidmap json + if isinstance(contig, str): # Old style vidmap json contig_name = contig contigs_map[contig] = { "length": contigs[contig]["length"], diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 999c122..ae1f63d 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -47,7 +47,7 @@ def parse_callset_json(callset_file): callsets = callset["callsets"] samples = [] for callset in callsets: - if isinstance(callset) == str: # Old style vidmap json + if isinstance(callset, str): # Old style vidmap json samples.append(callset) else: # Generated with vcf2genomicsdb_init samples.append(callset["sample_name"]) @@ -69,8 +69,8 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): callset["row_idx"] for sample in samples for callset in callsets - if (isinstance(callset) != str and sample == callset["sample_name"]) - or (isinstance(callset) == str and sample == callset) + if (not isinstance(callset, str) and sample == callset["sample_name"]) + or (isinstance(callset, str) and sample == callset) ] if len(rows) == 0: print(f"None of the samples{samples} specified were found in the workspace") @@ -342,7 +342,7 @@ def main(): arrays = [] for idx, partition in enumerate(partitions): - if isinstance(partition["begin"]) == int: # Old style vidmap json + if isinstance(partition["begin"], int): # Old style vidmap json column_begin = partition["begin"] if "end" in partition.keys(): column_end = partition["end"] From 732491c25b02f2851de97a3b1cc0589f73cfadf3 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Fri, 20 Dec 2024 16:36:28 -0800 Subject: [PATCH 3/8] No need to explicilty untar for examples/test.sh --- .github/workflows/basic.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/basic.yml b/.github/workflows/basic.yml index 7e18d4a..31093a3 100644 --- a/.github/workflows/basic.yml +++ b/.github/workflows/basic.yml @@ -96,8 +96,7 @@ jobs: run: | PYTHONPATH=. pytest PYTHONPATH=. python test/test.py - tar xzf examples/examples_ws.tgz - PYTHONPATH=. WORKSPACE=ws ./examples/test.sh + PYTHONPATH=. examples/test.sh - name: Cloud Test - Azurite run: | From 2b7b43e2b16878bdc09af4a2c6d6d6c65a1d66a4 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Fri, 20 Dec 2024 17:51:15 -0800 Subject: [PATCH 4/8] Remove unused args --- examples/genomicsdb_query | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index ae1f63d..ed15dd1 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -92,7 +92,7 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): return row_tuples -def parse_loader_json(workspace, loader_file, contigs_map): +def parse_loader_json(loader_file): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] array_names = [partition["array_name"] for partition in partitions] @@ -249,7 +249,7 @@ def setup_gdb(): if args.list_partitions: contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file) # parse loader.json for partitions - partitions = parse_loader_json(workspace, loader_file, contigs_map) + partitions = parse_loader_json(loader_file) print(*partitions, sep="\n") sys.exit(0) From 7e29837f8c481071f76719c383ad8880fe6b58c2 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 23 Dec 2024 11:07:52 -0800 Subject: [PATCH 5/8] Refinements --- examples/genomicsdb_query | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index ed15dd1..511af2d 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -45,12 +45,7 @@ from genomicsdb.protobuf import genomicsdb_export_config_pb2 as query_pb def parse_callset_json(callset_file): callset = json.loads(genomicsdb.read_entire_file(callset_file)) callsets = callset["callsets"] - samples = [] - for callset in callsets: - if isinstance(callset, str): # Old style vidmap json - samples.append(callset) - else: # Generated with vcf2genomicsdb_init - samples.append(callset["sample_name"]) + samples = [callset if isinstance(callset, str) else callset["sample_name"] for callset in callsets] return samples @@ -95,7 +90,7 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): def parse_loader_json(loader_file): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] - array_names = [partition["array_name"] for partition in partitions] + array_names = [partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions] return [name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names] From 8364653c84bad088fd208ef6fe1275bb81256901 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 23 Dec 2024 11:11:58 -0800 Subject: [PATCH 6/8] Lint --- examples/genomicsdb_query | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 511af2d..5b7a96d 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -90,7 +90,9 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): def parse_loader_json(loader_file): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] - array_names = [partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions] + array_names = [ + partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions + ] return [name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names] From 02a3c5adf5c961f89e135f75a9e984e7ec75c72d Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 23 Dec 2024 12:52:31 -0800 Subject: [PATCH 7/8] Bug --- examples/genomicsdb_query | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 5b7a96d..ab980b0 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -61,12 +61,13 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): return None # Old style json has type(callset) == str whereas the one generated with vcf2genomicsdb_init is a list rows = [ - callset["row_idx"] + callsets[callset]["row_idx"] if isinstance(callset, str) else callset["row_idx"] for sample in samples for callset in callsets if (not isinstance(callset, str) and sample == callset["sample_name"]) or (isinstance(callset, str) and sample == callset) ] + breakpoint() if len(rows) == 0: print(f"None of the samples{samples} specified were found in the workspace") return [] From 5b125ed2d926a77e4a486d5f34db70ce060310e7 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 23 Dec 2024 12:55:03 -0800 Subject: [PATCH 8/8] remove breakpoint --- examples/genomicsdb_query | 1 - 1 file changed, 1 deletion(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index ab980b0..62c89b4 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -67,7 +67,6 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): if (not isinstance(callset, str) and sample == callset["sample_name"]) or (isinstance(callset, str) and sample == callset) ] - breakpoint() if len(rows) == 0: print(f"None of the samples{samples} specified were found in the workspace") return []