From c13403944ff64a3e7f072da67ea4a3ab3ab57412 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Sun, 29 Dec 2024 10:16:48 -0800 Subject: [PATCH 1/8] Missed adding filter in the refactoring for multiprocessing --- examples/genomicsdb_query | 87 +++++++++++++++++++++++---------------- 1 file changed, 52 insertions(+), 35 deletions(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 39afb68..026e1be 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -320,6 +320,7 @@ class GenomicsDBExportConfig(NamedTuple): workspace: str vidmap_file: str callset_file: str + filter: str class GenomicsDBQueryConfig(NamedTuple): @@ -329,7 +330,6 @@ class GenomicsDBQueryConfig(NamedTuple): end: int array_name: str row_tuples: List[tuple] - filter: str class OutputConfig(NamedTuple): @@ -353,6 +353,8 @@ def configure_export(config: GenomicsDBExportConfig): export_config.callset_mapping_file = config.callset_file export_config.bypass_intersecting_intervals_phase = False export_config.enable_shared_posixfs_optimizations = True + if config.filter: + export_config.query_filter = config.filter return export_config @@ -384,45 +386,54 @@ def process(config): msg = f"array({query_config.array_name}) for interval({query_config.interval})" if not genomicsdb.array_exists(export_config.workspace, query_config.array_name): logging.error(msg + f" not imported into workspace({export_config.workspace})") - return + return -1 global gdb try: if gdb: logging.info("Found gdb to process " + msg) + else: + logging.error("Something wrong, gdb seems to be None") + return -1 except NameError: logging.info("Instantiating genomicsdb to process " + msg + "...") gdb = genomicsdb.connect_with_protobuf(configure_export(export_config)) logging.info("Instantiating genomicsdb to process " + msg + " DONE") query_protobuf = configure_query(query_config) - if output_config.type == "csv": - df = gdb.query_variant_calls(query_protobuf=query_protobuf, flatten_intervals=True) - df.to_csv(output_config.filename, index=False) - elif output_config.type == "json": - json_output = gdb.query_variant_calls(query_protobuf=query_protobuf, json_output=output_config.json_type) - with open(output_config.filename, "wb") as f: - f.write(json_output) - elif output_config.type == "arrow": - nbytes = 0 - writer = None - i = 0 - for out in gdb.query_variant_calls(query_protobuf=query_protobuf, arrow_output=True, batching=True): - reader = pa.ipc.open_stream(out) - for batch in reader: - if nbytes > output_config.max_arrow_bytes: - i += 1 - nbytes = 0 - if writer: - writer.close() - writer = None - if not writer: - writer = pq.ParquetWriter(f"{output_config.filename}__{i}.parquet", batch.schema) - nbytes += batch.nbytes - writer.write_batch(batch) - if writer: - writer.close() - writer = None + try: + if output_config.type == "csv": + df = gdb.query_variant_calls(query_protobuf=query_protobuf, flatten_intervals=True) + df.to_csv(output_config.filename, index=False) + elif output_config.type == "json": + json_output = gdb.query_variant_calls(query_protobuf=query_protobuf, json_output=output_config.json_type) + with open(output_config.filename, "wb") as f: + f.write(json_output) + elif output_config.type == "arrow": + nbytes = 0 + writer = None + i = 0 + for out in gdb.query_variant_calls(query_protobuf=query_protobuf, arrow_output=True, batching=True): + reader = pa.ipc.open_stream(out) + for batch in reader: + if nbytes > output_config.max_arrow_bytes: + i += 1 + nbytes = 0 + if writer: + writer.close() + writer = None + 
if not writer: + writer = pq.ParquetWriter(f"{output_config.filename}__{i}.parquet", batch.schema) + nbytes += batch.nbytes + writer.write_batch(batch) + if writer: + writer.close() + writer = None + except Exception as e: + logging.critical(f"Unexpected exception : {e}") + gdb = None + return -1 logging.info(f"Processed array {query_config.array_name} for interval {query_config.interval}") + return 0 def main(): @@ -445,7 +456,7 @@ def main(): max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size) print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files") - export_config = GenomicsDBExportConfig(workspace, vidmap_file, callset_file) + export_config = GenomicsDBExportConfig(workspace, vidmap_file, callset_file, args.filter) configs = [] for interval in intervals: print(f"Processing interval({interval})...") @@ -457,7 +468,7 @@ def main(): print(f"\tArrays:{arrays} under consideration for interval({interval})") for idx, array in enumerate(arrays): - query_config = GenomicsDBQueryConfig(interval, contig, start, end, array, row_tuples, filter) + query_config = GenomicsDBQueryConfig(interval, contig, start, end, array, row_tuples) output_config = OutputConfig( generate_output_filename(output, output_type, interval, idx), output_type, @@ -466,13 +477,19 @@ def main(): ) configs.append(Config(export_config, query_config, output_config)) - if len(configs) == 1: - process(configs[0]) + if min(len(configs), args.nproc) == 1: + results = list(map(process, configs)) else: with multiprocessing.Pool(processes=min(len(configs), args.nproc)) as pool: - pool.map(process, configs) + results = list(pool.map(process, configs)) + + msg = "successfully" + for result in results: + if result != 0: + msg = "unsuccessfully. Check output for errors" + break - print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed successfully") + print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed {msg}" ) if __name__ == "__main__": From 3afa07d884b7bc99db2096b617b78c3881779f6a Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Sun, 29 Dec 2024 11:15:08 -0800 Subject: [PATCH 2/8] Lint --- examples/genomicsdb_query | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 026e1be..98efd62 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -489,7 +489,7 @@ def main(): msg = "unsuccessfully. 
Check output for errors" break - print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed {msg}" ) + print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed {msg}") if __name__ == "__main__": From 929a086cef545f279b2d0e1d6d5c6b17611d7b73 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Sun, 29 Dec 2024 12:23:30 -0800 Subject: [PATCH 3/8] Support --list-partitions with -i/I specified --- examples/genomicsdb_common.py | 8 ++++++++ examples/genomicsdb_query | 12 +++++++++--- examples/test.sh | 8 ++++++++ 3 files changed, 25 insertions(+), 3 deletions(-) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 803e145..93d877a 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -86,6 +86,14 @@ def parse_interval(interval: str): raise RuntimeError(f"Interval {interval} could not be parsed") +def get_partitions(intervals, contigs_map, partitions): + arrays = [] + for interval in intervals: + _, _, _, array_names = get_arrays(interval, contigs_map, partitions) + arrays.extend([name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names]) + return arrays + + def get_arrays(interval, contigs_map, partitions): contig, start, end = parse_interval(interval) if contig in contigs_map: diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 98efd62..d0f3a1c 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -158,7 +158,7 @@ def setup(): parser.add_argument( "--list-partitions", action="store_true", - help="List interval partitions(genomicsdb arrays in the workspace) for the ingested samples in the workspace and exit", # noqa + help="List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit", # noqa ) parser.add_argument( "-i", @@ -257,7 +257,7 @@ def setup(): print(*samples, sep="\n") sys.exit(0) - if args.list_partitions: + if args.list_partitions and not args.interval and not args.interval_list: contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file) # parse loader.json for partitions partitions = parse_loader_json(loader_file) @@ -274,12 +274,18 @@ def setup(): ) contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, intervals or interval_list) - row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list) # parse loader.json for partitions loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] + if args.list_partitions: + partitions = genomicsdb_common.get_partitions(intervals, contigs_map, partitions) + print(*partitions, sep="\n") + sys.exit(0) + + row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list) + return workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, args diff --git a/examples/test.sh b/examples/test.sh index a65a2f6..39d82da 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -174,6 +174,13 @@ WORKSPACE=$OLDSTYLE_DIR/ws run_command "genomicsdb_query -w $WORKSPACE --list-samples" run_command "genomicsdb_query -w $WORKSPACE --list-contigs" run_command "genomicsdb_query -w $WORKSPACE --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS --list-partitions" +cmd="genomicsdb_query -w $WORKSPACE -i 1:1-10 --list-partitions" +PARTITION=$($cmd) +if [[ $PARTITION != "t0_1_2" ]]; then + echo "Expected output for $cmd shoule 
be t0_1_2, but got $PARTITION" + exit 1 +fi run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" @@ -182,6 +189,7 @@ run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" From 244cd27a88bc150a89849461ef6d647bf244d41c Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Mon, 30 Dec 2024 10:57:30 -0800 Subject: [PATCH 4/8] Add --list-arrays command arg --- examples/genomicsdb_query | 33 ++++++++++++++++++++++++++++++--- examples/test.sh | 4 +++- 2 files changed, 33 insertions(+), 4 deletions(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index d0f3a1c..c7a4f87 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -96,13 +96,13 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): return row_tuples -def parse_loader_json(loader_file): +def parse_loader_json(loader_file, interval_form=True): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] array_names = [ partition["array_name"] if "array_name" in partition else partition["array"] for partition in partitions ] - return [name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names] + return [name.replace("$", ":", 1).replace("$", "-", 1) if interval_form else name for name in array_names] def check_nproc(value): @@ -160,6 +160,11 @@ def setup(): action="store_true", help="List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit", # noqa ) + parser.add_argument( + "--list-arrays", + action="store_true", + help="Sanity check against partitions found in loader and vid mapping files and list genomicsdb arrays physically found in the workspace and exit", # noqa + ) parser.add_argument( "-i", "--interval", @@ -258,12 +263,34 @@ def setup(): sys.exit(0) if args.list_partitions and not args.interval and not args.interval_list: - contigs_map, _ = genomicsdb_common.parse_vidmap_json(vidmap_file) # parse loader.json for partitions partitions = parse_loader_json(loader_file) print(*partitions, sep="\n") sys.exit(0) + if args.list_arrays: + # parse loader.json for partitions + partitions = parse_loader_json(loader_file, False) + errors_found = False + found_partitions = [] + for partition in partitions: + if genomicsdb.array_exists(workspace, partition): + found_partitions.append(partition) + else: + logging.error(f"Array({partition}) does not seem to exist in workspace({workspace})") + errors_found = True + if errors_found: + print( + f"Check vid mapping({vidmap_file}) and loader({loader_file}) files for inconsistencies w.r.t workspace({workspace})" + ) # noqa + if len(found_partitions) > 0: + print("Following arrays found in workspace -") + print(*found_partitions, sep="\n") + sys.exit(1) + else: + print(*partitions, sep="\n") + 
sys.exit(0) + intervals = args.interval interval_list = args.interval_list samples = args.sample diff --git a/examples/test.sh b/examples/test.sh index 39d82da..62951b0 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -178,9 +178,10 @@ run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS --list-partitions" cmd="genomicsdb_query -w $WORKSPACE -i 1:1-10 --list-partitions" PARTITION=$($cmd) if [[ $PARTITION != "t0_1_2" ]]; then - echo "Expected output for $cmd shoule be t0_1_2, but got $PARTITION" + echo "Expected output for $cmd should be t0_1_2, but got $PARTITION" exit 1 fi +run_command "genomicsdb_query -w $WORKSPACE --list-arrays" 1 run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" @@ -189,6 +190,7 @@ run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSON --list-arrays" 1 run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" From e96ec6886f76345cbd5b8edc92ef3539b1d359bd Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Thu, 2 Jan 2025 16:24:05 -0800 Subject: [PATCH 5/8] Sanity check vidmap json, modify error processing --- examples/genomicsdb_common.py | 45 ++++++++++--------- examples/genomicsdb_query | 82 ++++++++++++++--------------------- examples/test.sh | 2 - 3 files changed, 58 insertions(+), 71 deletions(-) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 93d877a..30ed0db 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -25,8 +25,8 @@ # import json +import logging import os -import re import sys import genomicsdb @@ -53,37 +53,41 @@ def parse_vidmap_json(vidmap_file, intervals=None): all_intervals = not intervals if not intervals: intervals = [] + column_offset = 0 for contig in contigs: if isinstance(contig, str): # Old style vidmap json contig_name = contig - contigs_map[contig] = { + contig_elem = { "length": contigs[contig]["length"], "tiledb_column_offset": contigs[contig]["tiledb_column_offset"], } else: # Generated with vcf2genomicsdb_init contig_name = contig["name"] - contigs_map[contig["name"]] = contig + contig_elem = contig + # Sanity check of column offsets + if column_offset != contig_elem["tiledb_column_offset"]: + logging.critical( + f"vidmap file({vidmap_file}) has wrong column offset for {contig_name}. 
Cannot process vidmap file from contigs({contig_name})" # noqa + ) + break + contigs_map[contig_name] = contig_elem + column_offset += contig_elem["length"] if all_intervals: intervals.append(contig_name) return contigs_map, intervals -interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)") - - -# get tiledb offsets for interval def parse_interval(interval: str): - result = re.match(interval_pattern, interval) - if result: - length = len(result.groups()) - if length == 6: - if result.group(1) and result.group(2) and result.group(3): - return result.group(1), int(result.group(2)), int(result.group(3)) - elif result.group(4) and result.group(5): - return result.group(4), int(result.group(5)), None - elif result.group(6): - return result.group(6), 1, None - raise RuntimeError(f"Interval {interval} could not be parsed") + results = interval.split(":") + contig = results[0] + if len(results) > 1: + results = results[1].split("-") + if len(results) > 1: + return contig, int(results[0]), int(results[1]) + else: + return contig, int(results[0]), None + else: + return contig, 1, None def get_partitions(intervals, contigs_map, partitions): @@ -105,11 +109,12 @@ def get_arrays(interval, contigs_map, partitions): end = length contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1 else: - print(f"Contig({contig}) not found in vidmap.json") + logging.error(f"Contig({contig}) not found in vidmap.json") + return 0, 0, 0, [] arrays = [] for idx, partition in enumerate(partitions): - if isinstance(partition["begin"], int): # Old style vidmap json + if isinstance(partition["begin"], int): # Old style loader json column_begin = partition["begin"] if "end" in partition.keys(): column_end = partition["end"] diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index c7a4f87..545caf2 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -160,11 +160,6 @@ def setup(): action="store_true", help="List interval partitions(genomicsdb arrays in the workspace) for the given intervals(-i/--interval or -I/--interval-list) or all the intervals for the workspace and exit", # noqa ) - parser.add_argument( - "--list-arrays", - action="store_true", - help="Sanity check against partitions found in loader and vid mapping files and list genomicsdb arrays physically found in the workspace and exit", # noqa - ) parser.add_argument( "-i", "--interval", @@ -252,63 +247,45 @@ def setup(): ): raise RuntimeError(f"callset({callset_file}) vidmap({vidmap_file}) or loader({loader_file}) not found") - if args.list_contigs: - _, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file) - print(*intervals, sep="\n") - sys.exit(0) - + # List samples if args.list_samples: samples = parse_callset_json(callset_file) print(*samples, sep="\n") sys.exit(0) - if args.list_partitions and not args.interval and not args.interval_list: - # parse loader.json for partitions - partitions = parse_loader_json(loader_file) - print(*partitions, sep="\n") - sys.exit(0) - - if args.list_arrays: - # parse loader.json for partitions - partitions = parse_loader_json(loader_file, False) - errors_found = False - found_partitions = [] - for partition in partitions: - if genomicsdb.array_exists(workspace, partition): - found_partitions.append(partition) - else: - logging.error(f"Array({partition}) does not seem to exist in workspace({workspace})") - errors_found = True - if errors_found: - print( - f"Check vid mapping({vidmap_file}) and loader({loader_file}) files for inconsistencies w.r.t 
workspace({workspace})" - ) # noqa - if len(found_partitions) > 0: - print("Following arrays found in workspace -") - print(*found_partitions, sep="\n") - sys.exit(1) - else: - print(*partitions, sep="\n") - sys.exit(0) - intervals = args.interval interval_list = args.interval_list samples = args.sample sample_list = args.sample_list - if not intervals and not samples and not interval_list and not sample_list: - raise RuntimeError( - "Specify at least one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified" # noqa - ) + if not args.list_partitions and not args.list_contigs: + if not intervals and not samples and not interval_list and not sample_list: + raise RuntimeError( + "Specify at least one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified" # noqa + ) contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, intervals or interval_list) - # parse loader.json for partitions + # List contigs + if args.list_contigs: + if intervals: + for interval in intervals: + if interval.split(":")[0] in contigs_map.keys(): + print(interval) + else: + print(*contigs_map.keys(), sep="\n") + sys.exit(0) + loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] + # List partitions if args.list_partitions: - partitions = genomicsdb_common.get_partitions(intervals, contigs_map, partitions) - print(*partitions, sep="\n") + if args.interval or args.interval_list: + partition_names = genomicsdb_common.get_partitions(intervals, contigs_map, partitions) + else: + # just parse loader.json for partitions + partition_names = parse_loader_json(loader_file) + print(*partition_names, sep="\n") sys.exit(0) row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list) @@ -496,7 +473,7 @@ def main(): contig, start, end, arrays = genomicsdb_common.get_arrays(interval, contigs_map, partitions) if len(arrays) == 0: - print(f"No arrays in the workspace matched input interval({interval})") + logging.error(f"No arrays in the workspace matched input interval({interval})") continue print(f"\tArrays:{arrays} under consideration for interval({interval})") @@ -510,6 +487,9 @@ def main(): ) configs.append(Config(export_config, query_config, output_config)) + if len(configs) == 0: + print("Nothing to process!!. Check output for possible errors") + sys.exit(1) if min(len(configs), args.nproc) == 1: results = list(map(process, configs)) else: @@ -519,11 +499,15 @@ def main(): msg = "successfully" for result in results: if result != 0: - msg = "unsuccessfully. Check output for errors" + msg = "unsuccessfully. 
Check output for possible errors" break print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed {msg}") if __name__ == "__main__": - main() + try: + main() + except RuntimeError as e: + logging.error(e) + sys.exit(1) diff --git a/examples/test.sh b/examples/test.sh index 62951b0..0cbc0ce 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -181,7 +181,6 @@ if [[ $PARTITION != "t0_1_2" ]]; then echo "Expected output for $cmd should be t0_1_2, but got $PARTITION" exit 1 fi -run_command "genomicsdb_query -w $WORKSPACE --list-arrays" 1 run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" @@ -190,7 +189,6 @@ run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" -run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSON --list-arrays" 1 run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" From 7f251737c9e14f0e5579200a80a91c74af359e2e Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Thu, 2 Jan 2025 16:29:56 -0800 Subject: [PATCH 6/8] Lint --- examples/genomicsdb_common.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/genomicsdb_common.py b/examples/genomicsdb_common.py index 30ed0db..cdc5ea8 100644 --- a/examples/genomicsdb_common.py +++ b/examples/genomicsdb_common.py @@ -67,7 +67,7 @@ def parse_vidmap_json(vidmap_file, intervals=None): # Sanity check of column offsets if column_offset != contig_elem["tiledb_column_offset"]: logging.critical( - f"vidmap file({vidmap_file}) has wrong column offset for {contig_name}. Cannot process vidmap file from contigs({contig_name})" # noqa + f"vidmap file({vidmap_file}) has wrong column offset for {contig_name}. 
Cannot process vidmap file from contigs({contig_name})" # noqa ) break contigs_map[contig_name] = contig_elem From 1478663fc8b3695326ee0c96de1da3b79e0d9ed1 Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Thu, 2 Jan 2025 23:04:58 -0800 Subject: [PATCH 7/8] Support listing and passing fields/atrributes as argument to genomicsdb_query --- examples/genomicsdb_query | 53 +++++++++++++++++++++++++++++++++------ examples/test.sh | 8 ++++++ 2 files changed, 54 insertions(+), 7 deletions(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index 545caf2..b581272 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -96,6 +96,25 @@ def parse_callset_json_for_row_ranges(callset_file, samples=None): return row_tuples +def parse_vidmap_json_for_attributes(vidmap_file, attributes=None): + if attributes is None: + return ["GT"] + else: + vidmap = json.loads(genomicsdb.read_entire_file(vidmap_file)) + fields = vidmap["fields"] + if isinstance(fields, list): + fields = [field["name"] for field in fields] + else: # Old style vidmap json + fields = fields.keys() + if len(attributes) == 0: + return fields + attributes = attributes.replace(" ", "").split(",") + not_found = [attribute for attribute in attributes if attribute not in fields] + if len(not_found) > 0: + raise RuntimeError(f"Attributes({not_found}) not found in vid mapping({vidmap_file})") + return attributes + + def parse_loader_json(loader_file, interval_form=True): loader = json.loads(genomicsdb.read_entire_file(loader_file)) partitions = loader["column_partitions"] @@ -119,7 +138,7 @@ def check_nproc(value): def setup(): parser = argparse.ArgumentParser( prog="query", - description="GenomicsDB simple query with samples/intervals/filter as inputs", + description="GenomicsDB simple query with samples/intervals/attributes/filter as inputs", formatter_class=argparse.RawTextHelpFormatter, usage="%(prog)s [options]", ) @@ -153,7 +172,12 @@ def setup(): ) parser.add_argument("--list-samples", action="store_true", help="List samples ingested into the workspace and exit") parser.add_argument( - "--list-contigs", action="store_true", help="List contigs for the ingested samples in the workspace and exit" + "--list-contigs", action="store_true", help="List contigs configured in vid mapping for the workspace and exit" + ) + parser.add_argument( + "--list-fields", + action="store_true", + help="List genomic fields configured in vid mapping for the workspace and exit", ) parser.add_argument( "--list-partitions", @@ -186,6 +210,12 @@ def setup(): required=False, help="sample file containing list of samples, one sample per line, to operate upon. \nNote: \n\t1. -s/--sample and -S/--sample-list are mutually exclusive \n\t2. either samples and/or intervals using -i/-I/-s/-S options has to be specified", # noqa ) + parser.add_argument( + "-a", + "--attributes", + required=False, + help="Optional - comma separated list of genomic attributes or fields described in the vid mapping for the query, eg. GT,AC,PL,DP... 
Defaults to GT", # noqa + ) parser.add_argument( "-f", "--filter", @@ -253,6 +283,12 @@ def setup(): print(*samples, sep="\n") sys.exit(0) + # List fields + if args.list_fields: + fields = parse_vidmap_json_for_attributes(vidmap_file, attributes="") + print(*fields, sep="\n") + sys.exit(0) + intervals = args.interval interval_list = args.interval_list samples = args.sample @@ -260,7 +296,7 @@ def setup(): if not args.list_partitions and not args.list_contigs: if not intervals and not samples and not interval_list and not sample_list: raise RuntimeError( - "Specify at least one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified" # noqa + "one of either -i/-interval -I/--interval-list -s/--sample -S/--sample-list has to be specified" # noqa ) contigs_map, intervals = genomicsdb_common.parse_vidmap_json(vidmap_file, intervals or interval_list) @@ -289,8 +325,9 @@ def setup(): sys.exit(0) row_tuples = parse_callset_json_for_row_ranges(callset_file, samples or sample_list) + attributes = parse_vidmap_json_for_attributes(vidmap_file, args.attributes) - return workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, args + return workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, attributes, args def generate_output_filename(output, output_type, interval, idx): @@ -330,6 +367,7 @@ class GenomicsDBExportConfig(NamedTuple): workspace: str vidmap_file: str callset_file: str + attributes: str filter: str @@ -358,11 +396,12 @@ class Config(NamedTuple): def configure_export(config: GenomicsDBExportConfig): export_config = query_pb.ExportConfiguration() export_config.workspace = config.workspace - export_config.attributes.extend(["REF", "ALT", "GT"]) export_config.vid_mapping_file = config.vidmap_file export_config.callset_mapping_file = config.callset_file export_config.bypass_intersecting_intervals_phase = False export_config.enable_shared_posixfs_optimizations = True + if config.attributes: + export_config.attributes.extend(config.attributes) if config.filter: export_config.query_filter = config.filter return export_config @@ -447,7 +486,7 @@ def process(config): def main(): - workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, args = setup() + workspace, callset_file, vidmap_file, partitions, contigs_map, intervals, row_tuples, attributes, args = setup() if row_tuples is not None and len(row_tuples) == 0: return @@ -466,7 +505,7 @@ def main(): max_arrow_bytes = parse_args_for_max_bytes(args.max_arrow_byte_size) print(f"Using {args.max_arrow_byte_size} number of bytes as hint for writing out parquet files") - export_config = GenomicsDBExportConfig(workspace, vidmap_file, callset_file, args.filter) + export_config = GenomicsDBExportConfig(workspace, vidmap_file, callset_file, attributes, args.filter) configs = [] for interval in intervals: print(f"Processing interval({interval})...") diff --git a/examples/test.sh b/examples/test.sh index 0cbc0ce..a4a648d 100755 --- a/examples/test.sh +++ b/examples/test.sh @@ -44,6 +44,7 @@ SAMPLE_ARGS="-s HG00096 -s HG00097 -s HG00099" VIDMAP_FILE=vidmap_file.json LOADER_FILE=loader_file.json FILTER='ISHOMREF' +FIELDS="DP,GT" if [[ $(uname) == "Darwin" ]]; then TEMP_DIR=$(mktemp -d -t test-examples) @@ -141,9 +142,12 @@ run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command 
"genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s NON_EXISTENT_SAMPLE -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -s HG00096 -s NON_EXISTENT_SAMPLE -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE -I $TEMP_DIR/contigs.list -S $TEMP_DIR/samples.list -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -f $FILTER -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -a $FIELDS -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -a NON_EXISTENT_FIELD,$FIELDS -o $OUTPUT" 1 rm -f loader.json callset.json vidmap.json run_command "genomicsdb_cache -w $WORKSPACE $INTERVAL_ARGS" @@ -173,6 +177,7 @@ WORKSPACE=$OLDSTYLE_DIR/ws run_command "genomicsdb_query -w $WORKSPACE --list-samples" run_command "genomicsdb_query -w $WORKSPACE --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE --list-fields" run_command "genomicsdb_query -w $WORKSPACE --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS --list-partitions" cmd="genomicsdb_query -w $WORKSPACE -i 1:1-10 --list-partitions" @@ -183,14 +188,17 @@ if [[ $PARTITION != "t0_1_2" ]]; then fi run_command "genomicsdb_query -w $WORKSPACE -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $INTERVAL_ARGS -S $TEMP_DIR/samples.list -a GT -o $OUTPUT" OLDSTYLE_JSONS="-l $OLDSTYLE_DIR/loader.json -c $OLDSTYLE_DIR/callset_t0_1_2.json -v $OLDSTYLE_DIR/vid.json" run_command "genomicsdb_cache -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-samples" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-contigs" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-fields" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS --list-partitions" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS -s HG00097 -s HG00100 -s HG00096 -o $OUTPUT" run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -o $OUTPUT" +run_command "genomicsdb_query -w $WORKSPACE $OLDSTYLE_JSONS $INTERVAL_ARGS -S $TEMP_DIR/samples.list -a GT -o $OUTPUT" cleanup From a32500bef8f0a1b1a61ade9a6ae898ae874e0f1b Mon Sep 17 00:00:00 2001 From: Nalini Ganapati Date: Fri, 3 Jan 2025 11:24:40 -0800 Subject: [PATCH 8/8] Change error message --- examples/genomicsdb_query | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/genomicsdb_query b/examples/genomicsdb_query index b581272..f952d55 100755 --- a/examples/genomicsdb_query +++ b/examples/genomicsdb_query @@ -538,7 +538,7 @@ def main(): msg = "successfully" for result in results: if result != 0: - msg = "unsuccessfully. Check output for possible errors" + msg = "unsuccessfully for some arrays. Check output for errors" break print(f"genomicsdb_query for workspace({workspace}) and intervals({intervals}) completed {msg}")