openvar · Peter-J-Freeman · May 1, 2025 · May 20, 2025 · May 20, 2025 · Jul 22, 2025
diff --git a/.DS_Store b/.DS_Store
diff --git a/VariantFormatter/__init__.py b/VariantFormatter/__init__.py
@@ -1,20 +1,20 @@
-import pkg_resources
+import importlib.metadata
 import re
 import warnings
 
 # Pull in use_scm_version=True enabled version number
 _is_released_version = False
 try:
-    __version__ = pkg_resources.get_distribution("VariantFormatter").version
+    __version__ = importlib.metadata.version("VariantFormatter")
     if re.match(r"^\d+\.\d+\.\d+$", __version__) is not None:
         _is_released_version = True
-except pkg_resources.DistributionNotFound as e:
+except importlib.metadata.PackageNotFoundError:
     warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning)
     __version__ = None
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/formatter.py b/VariantFormatter/formatter.py
@@ -413,7 +413,7 @@ def gap_checker(hgvs_transcript, hgvs_genomic, genome_build, vfo, transcript_mod
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/gapGenes.py b/VariantFormatter/gapGenes.py
@@ -225,7 +225,7 @@ def fully_normalize(hgvs_tx, hgvs_genomic, hn, reverse_normalizer, vm, vfo):
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/simpleVariantFormatter.py b/VariantFormatter/simpleVariantFormatter.py
@@ -169,7 +169,7 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts
 
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as

diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py
@@ -388,23 +388,27 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
         else:
             transcript_list = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model, self.vfo, genome_build)
 
-        # Remove malform IDs
-        cp_transcript_list = copy.copy(transcript_list)
-        transcript_list = []
-        for tx in cp_transcript_list:
+        # De-dup and remove malformed IDs (EG ENS seqs with different GRCh37 and GRCh38 ver but same id and version no.)
+        # Also sort by the number of main chr like mappings, this avoids pyliftover where possible, since we re-use the
+        # first non-failing hit for all liftover. (Where mapping details exist the order is, ref, alt, strand ....)
+        transcript_dict = {}
+        for tx in transcript_list:
             # Known UTA ID malforms
             if re.search('\/', tx[0]):
                 continue
-            else:
-                transcript_list.append(tx)
-
+            # don't exclude any tx, even if we have no chr like mapping
+            if tx[0] not in transcript_dict:
+                transcript_dict[tx[0]] = +1
+            # but add to the sort number if we do have a main chr like mapping
+            if len(tx) > 2 and (tx[1].startswith('NC_00') or tx[1] in ['NC_012920.1', 'NC_001807.4']):
+                transcript_dict[tx[0]] = transcript_dict[tx[0]] +1
+
+        transcript_list = sorted(transcript_dict.keys(),key=lambda k:transcript_dict[k],reverse=True)
         # Create a variable to trap direct g_g liftover
         g_to_g_lift = {}
 
         # Create transcript level descriptions
-        for tx_alignment_data in transcript_list:
-            tx_id = tx_alignment_data[0]
-
+        for tx_id in transcript_list:
             # Get transcript annotations
             try:
                 annotation = vfo.db.get_transcript_annotation(tx_id)
@@ -461,7 +465,6 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                 if tx_id not in str(overlapping_tx):
                     continue
 
-
             hgvs_transcript_dict = formatter.hgvs_genomic2hgvs_transcript(g_hgvs, tx_id, self.vfo)
 
             # Gap checking
@@ -496,6 +499,7 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                             hgvs_protein_tlc = formatter.hgvs_transcript2hgvs_protein(am_i_gapped['hgvs_transcript'],
                                                                                       self.genome_build,
                                                                                       self.vfo)
+                            hgvs_protein_tlc = formatter.remove_reference(hgvs_protein_tlc)
                             # Handle edits that have been stringified
                             try:
                                 hgvs_protein_tlc.posedit.edit.ref
@@ -577,6 +581,29 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                                                specified_tx_variant=specified_tx_variant
                                                )
 
+                    # Use PyLiftover if needed to add missing lifts to primary assembly
+                    if current_lift[build_to.lower()] == {}:
+                        direct_lift = lo.liftover(self.genomic_descriptions.g_hgvs,
+                                                   self.genomic_descriptions.selected_build,
+                                                   build_to,
+                                                   vfo.splign_normalizer,
+                                                   vfo.reverse_splign_normalizer,
+                                                   None,
+                                                   vfo,
+                                                   specify_tx=tx_id,
+                                                   liftover_level=self.liftover,
+                                                   gap_map=formatter.gap_checker,
+                                                   vfo=self.vfo,
+                                                   specified_tx_variant=specified_tx_variant,
+                                                   force_pyliftover=True
+                                                   )
+
+                        current_lift[build_to.lower()] = direct_lift[build_to.lower()]
+                        if build_to == "GRCh37":
+                            current_lift["hg19"] = direct_lift[build_to.lower()]
+                        if build_to == "GRCh38":
+                            current_lift["hg39"] = direct_lift[build_to.lower()]
+
                     if "am_i_gapped" in current_lift.keys():
                         if order_my_tp['gapped_alignment_warning'] == "":
                             order_my_tp['gapped_alignment_warning'] = current_lift['am_i_gapped'][
@@ -591,6 +618,12 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None
                 elif order_my_tp['transcript_variant_error'] is not None and g_to_g_lift != {}:
                     current_lift = g_to_g_lift
 
+                # first edit liftover to text, as required for output
+                for key, val in current_lift.items():
+                    for chr_type in val.keys():
+                        current_lift[key][chr_type]['hgvs_genomic_description'] = \
+                            current_lift[key][chr_type]['hgvs_genomic_description'].format(
+                                {'max_ref_length': 0})
                 # Copy the liftover and split into primary and alt
                 cp_current_lift = copy.deepcopy(current_lift)
                 scaff_lift = copy.deepcopy(current_lift)
@@ -662,6 +695,13 @@ def stucture_data(self):
                                                liftover_level=self.liftover
                                                )
 
+                    # First edit liftover to text, as required for output
+                    for key, val in current_lift.items():
+                        for chr_type in val.keys():
+                            current_lift[key][chr_type]['hgvs_genomic_description'] = \
+                                current_lift[key][chr_type]['hgvs_genomic_description'].format(
+                                    {'max_ref_length': 0})
+
                     # Copy the liftover and split into primary and alt
                     cp_current_lift = copy.deepcopy(current_lift)
                     scaff_lift = copy.deepcopy(current_lift)
@@ -701,7 +741,7 @@ def collect_metadata(self):
         return meta
 
 # <LICENSE>
-# Copyright (C) 2016-2025 VariantValidator Contributors
+# Copyright (C) 2016-2026 VariantValidator Contributors
 #
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as