Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 32 additions & 19 deletions examples/genomicsdb_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,8 @@
#

import json
import logging
import os
import re
import sys

import genomicsdb
Expand All @@ -53,37 +53,49 @@ def parse_vidmap_json(vidmap_file, intervals=None):
all_intervals = not intervals
if not intervals:
intervals = []
column_offset = 0
for contig in contigs:
if isinstance(contig, str): # Old style vidmap json
contig_name = contig
contigs_map[contig] = {
contig_elem = {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We may not need this, but doesn't contig_elem also have a name in the vcf2genomicsdb_init case?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, it does in the vcf2genomicsdb_init case, but are really ignoring as the contigs map is a dict for faster lookup.

"length": contigs[contig]["length"],
"tiledb_column_offset": contigs[contig]["tiledb_column_offset"],
}
else: # Generated with vcf2genomicsdb_init
contig_name = contig["name"]
contigs_map[contig["name"]] = contig
contig_elem = contig
# Sanity check of column offsets
if column_offset != contig_elem["tiledb_column_offset"]:
logging.critical(
f"vidmap file({vidmap_file}) has wrong column offset for {contig_name}. Cannot process vidmap file from contigs({contig_name})" # noqa
)
break
contigs_map[contig_name] = contig_elem
column_offset += contig_elem["length"]
if all_intervals:
intervals.append(contig_name)
return contigs_map, intervals


interval_pattern = re.compile(r"(.*):(.*)-(.*)|(.*):(.*)|(.*)")
def parse_interval(interval: str):
results = interval.split(":")
contig = results[0]
if len(results) > 1:
results = results[1].split("-")
if len(results) > 1:
return contig, int(results[0]), int(results[1])
else:
return contig, int(results[0]), None
else:
return contig, 1, None


# get tiledb offsets for interval
def parse_interval(interval: str):
result = re.match(interval_pattern, interval)
if result:
length = len(result.groups())
if length == 6:
if result.group(1) and result.group(2) and result.group(3):
return result.group(1), int(result.group(2)), int(result.group(3))
elif result.group(4) and result.group(5):
return result.group(4), int(result.group(5)), None
elif result.group(6):
return result.group(6), 1, None
raise RuntimeError(f"Interval {interval} could not be parsed")
def get_partitions(intervals, contigs_map, partitions):
arrays = []
for interval in intervals:
_, _, _, array_names = get_arrays(interval, contigs_map, partitions)
arrays.extend([name.replace("$", ":", 1).replace("$", "-", 1) for name in array_names])
return arrays


def get_arrays(interval, contigs_map, partitions):
Expand All @@ -97,11 +109,12 @@ def get_arrays(interval, contigs_map, partitions):
end = length
contig_end = contigs_map[contig]["tiledb_column_offset"] + length - 1
else:
print(f"Contig({contig}) not found in vidmap.json")
logging.error(f"Contig({contig}) not found in vidmap.json")
return 0, 0, 0, []

arrays = []
for idx, partition in enumerate(partitions):
if isinstance(partition["begin"], int): # Old style vidmap json
if isinstance(partition["begin"], int): # Old style loader json
column_begin = partition["begin"]
if "end" in partition.keys():
column_end = partition["end"]
Expand Down
Loading