diff --git a/.DS_Store b/.DS_Store index a6eb702..46f65c5 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/VariantFormatter/__init__.py b/VariantFormatter/__init__.py index 0e70799..7d04091 100644 --- a/VariantFormatter/__init__.py +++ b/VariantFormatter/__init__.py @@ -1,20 +1,20 @@ -import pkg_resources +import importlib.metadata import re import warnings # Pull in use_scm_version=True enabled version number _is_released_version = False try: - __version__ = pkg_resources.get_distribution("VariantFormatter").version + __version__ = importlib.metadata.version("VariantFormatter") if re.match(r"^\d+\.\d+\.\d+$", __version__) is not None: _is_released_version = True -except pkg_resources.DistributionNotFound as e: +except importlib.metadata.PackageNotFoundError: warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning) __version__ = None # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/formatter.py b/VariantFormatter/formatter.py index 9df413b..df4f7aa 100644 --- a/VariantFormatter/formatter.py +++ b/VariantFormatter/formatter.py @@ -413,7 +413,7 @@ def gap_checker(hgvs_transcript, hgvs_genomic, genome_build, vfo, transcript_mod # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/gapGenes.py b/VariantFormatter/gapGenes.py index 9d703cb..cf0da00 100644 --- a/VariantFormatter/gapGenes.py +++ b/VariantFormatter/gapGenes.py @@ -225,7 +225,7 @@ def fully_normalize(hgvs_tx, hgvs_genomic, hn, reverse_normalizer, vm, vfo): # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/simpleVariantFormatter.py b/VariantFormatter/simpleVariantFormatter.py index 4c613b4..388b04c 100644 --- a/VariantFormatter/simpleVariantFormatter.py +++ b/VariantFormatter/simpleVariantFormatter.py @@ -169,7 +169,7 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py index a83fd5f..c27e31b 100644 --- a/VariantFormatter/variantformatter.py +++ b/VariantFormatter/variantformatter.py @@ -388,23 +388,27 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None else: transcript_list = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model, self.vfo, genome_build) - # Remove malform IDs - cp_transcript_list = copy.copy(transcript_list) - transcript_list = [] - for tx in cp_transcript_list: + # De-dup and remove malformed IDs (EG ENS seqs with different GRCh37 and GRCh38 ver but same id and version no.) + # Also sort by the number of main chr like mappings, this avoids pyliftover where possible, since we re-use the + # first non-failing hit for all liftover. (Where mapping details exist the order is, ref, alt, strand ....) + transcript_dict = {} + for tx in transcript_list: # Known UTA ID malforms if re.search('\/', tx[0]): continue - else: - transcript_list.append(tx) - + # don't exclude any tx, even if we have no chr like mapping + if tx[0] not in transcript_dict: + transcript_dict[tx[0]] = +1 + # but add to the sort number if we do have a main chr like mapping + if len(tx) > 2 and (tx[1].startswith('NC_00') or tx[1] in ['NC_012920.1', 'NC_001807.4']): + transcript_dict[tx[0]] = transcript_dict[tx[0]] +1 + + transcript_list = sorted(transcript_dict.keys(),key=lambda k:transcript_dict[k],reverse=True) # Create a variable to trap direct g_g liftover g_to_g_lift = {} # Create transcript level descriptions - for tx_alignment_data in transcript_list: - tx_id = tx_alignment_data[0] - + for tx_id in transcript_list: # Get transcript annotations try: annotation = vfo.db.get_transcript_annotation(tx_id) @@ -461,7 +465,6 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None if tx_id not in str(overlapping_tx): continue - hgvs_transcript_dict = formatter.hgvs_genomic2hgvs_transcript(g_hgvs, tx_id, self.vfo) # Gap checking @@ -496,6 +499,7 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None hgvs_protein_tlc = formatter.hgvs_transcript2hgvs_protein(am_i_gapped['hgvs_transcript'], self.genome_build, self.vfo) + hgvs_protein_tlc = formatter.remove_reference(hgvs_protein_tlc) # Handle edits that have been stringified try: hgvs_protein_tlc.posedit.edit.ref @@ -577,6 +581,29 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None specified_tx_variant=specified_tx_variant ) + # Use PyLiftover if needed to add missing lifts to primary assembly + if current_lift[build_to.lower()] == {}: + direct_lift = lo.liftover(self.genomic_descriptions.g_hgvs, + self.genomic_descriptions.selected_build, + build_to, + vfo.splign_normalizer, + vfo.reverse_splign_normalizer, + None, + vfo, + specify_tx=tx_id, + liftover_level=self.liftover, + gap_map=formatter.gap_checker, + vfo=self.vfo, + specified_tx_variant=specified_tx_variant, + force_pyliftover=True + ) + + current_lift[build_to.lower()] = direct_lift[build_to.lower()] + if build_to == "GRCh37": + current_lift["hg19"] = direct_lift[build_to.lower()] + if build_to == "GRCh38": + current_lift["hg39"] = direct_lift[build_to.lower()] + if "am_i_gapped" in current_lift.keys(): if order_my_tp['gapped_alignment_warning'] == "": order_my_tp['gapped_alignment_warning'] = current_lift['am_i_gapped'][ @@ -591,6 +618,12 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None elif order_my_tp['transcript_variant_error'] is not None and g_to_g_lift != {}: current_lift = g_to_g_lift + # first edit liftover to text, as required for output + for key, val in current_lift.items(): + for chr_type in val.keys(): + current_lift[key][chr_type]['hgvs_genomic_description'] = \ + current_lift[key][chr_type]['hgvs_genomic_description'].format( + {'max_ref_length': 0}) # Copy the liftover and split into primary and alt cp_current_lift = copy.deepcopy(current_lift) scaff_lift = copy.deepcopy(current_lift) @@ -662,6 +695,13 @@ def stucture_data(self): liftover_level=self.liftover ) + # First edit liftover to text, as required for output + for key, val in current_lift.items(): + for chr_type in val.keys(): + current_lift[key][chr_type]['hgvs_genomic_description'] = \ + current_lift[key][chr_type]['hgvs_genomic_description'].format( + {'max_ref_length': 0}) + # Copy the liftover and split into primary and alt cp_current_lift = copy.deepcopy(current_lift) scaff_lift = copy.deepcopy(current_lift) @@ -701,7 +741,7 @@ def collect_metadata(self): return meta # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as