From e022a148038910813dd93df554c895d0baa3e06a Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Tue, 20 May 2025 13:30:41 +0100 Subject: [PATCH 1/5] Remove reference bases from protein dups and dels --- .DS_Store | Bin 10244 -> 10244 bytes VariantFormatter/variantformatter.py | 1 + 2 files changed, 1 insertion(+) diff --git a/.DS_Store b/.DS_Store index a6eb702bbd38cf95a6e8c2994c9fae13978c0615..46f65c576d86ee692873d6c8b911497ffd04bcb1 100644 GIT binary patch delta 48 zcmZn(XbISmBFVH?d-4WJi^=mOx!7kckM`Vn;n?K$lFl4Zw&P?4NwducQdf8Z;f@o! delta 48 zcmZn(XbISmBFVI7)8q}37L(^ma Date: Tue, 22 Jul 2025 19:15:00 +0100 Subject: [PATCH 2/5] remove pkg_resources --- VariantFormatter/__init__.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/VariantFormatter/__init__.py b/VariantFormatter/__init__.py index 0e70799..722c745 100644 --- a/VariantFormatter/__init__.py +++ b/VariantFormatter/__init__.py @@ -1,14 +1,14 @@ -import pkg_resources +import importlib.metadata import re import warnings # Pull in use_scm_version=True enabled version number _is_released_version = False try: - __version__ = pkg_resources.get_distribution("VariantFormatter").version + __version__ = importlib.metadata.version("VariantFormatter") if re.match(r"^\d+\.\d+\.\d+$", __version__) is not None: _is_released_version = True -except pkg_resources.DistributionNotFound as e: +except importlib.metadata.PackageNotFoundError: warnings.warn("can't get __version__ because %s package isn't installed" % __package__, Warning) __version__ = None From 1667f89958a76281450a4000f68cede750597a39 Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Wed, 6 Aug 2025 15:10:42 +0100 Subject: [PATCH 3/5] Code updates for liftover --- VariantFormatter/variantformatter.py | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py index d145df8..4477bda 100644 --- a/VariantFormatter/variantformatter.py +++ b/VariantFormatter/variantformatter.py @@ -550,6 +550,29 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None specified_tx_variant=specified_tx_variant ) + # Use PyLiftover if needed to add missing lifts to primary assembly + if current_lift[build_to.lower()] == {}: + direct_lift = lo.liftover(self.genomic_descriptions.g_hgvs, + self.genomic_descriptions.selected_build, + build_to, + vfo.splign_normalizer, + vfo.reverse_splign_normalizer, + None, + vfo, + specify_tx=tx_id, + liftover_level=self.liftover, + gap_map=formatter.gap_checker, + vfo=self.vfo, + specified_tx_variant=specified_tx_variant, + force_pyliftover=True + ) + + current_lift[build_to.lower()] = direct_lift[build_to.lower()] + if build_to == "GRCh37": + current_lift["hg19"] = direct_lift[build_to.lower()] + if build_to == "GRCh38": + current_lift["hg39"] = direct_lift[build_to.lower()] + if "am_i_gapped" in current_lift.keys(): if order_my_tp['gapped_alignment_warning'] == "": order_my_tp['gapped_alignment_warning'] = current_lift['am_i_gapped'][ From 89457ef8db4d47ca3984d3c4c329025b661e2703 Mon Sep 17 00:00:00 2001 From: John-F-Wagstaff Date: Mon, 6 Oct 2025 16:53:42 +0100 Subject: [PATCH 4/5] Handle hgvs object output from lift-over Also run a de-dup step pre lift to reduce grind, sorted to prioritise transcript based lift over pyliftover based lift. --- VariantFormatter/variantformatter.py | 38 ++++++++++++++++++++-------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py index 930e3a0..2db6024 100644 --- a/VariantFormatter/variantformatter.py +++ b/VariantFormatter/variantformatter.py @@ -388,23 +388,27 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None else: transcript_list = formatter.fetch_aligned_transcripts(g_hgvs, self.transcript_model, self.vfo, genome_build) - # Remove malform IDs - cp_transcript_list = copy.copy(transcript_list) - transcript_list = [] - for tx in cp_transcript_list: + # De-dup and remove malformed IDs (EG ENS seqs with different GRCh37 and GRCh38 ver but same id and version no.) + # Also sort by the number of main chr like mappings, this avoids pyliftover where possible, since we re-use the + # first non-failing hit for all liftover. (Where mapping details exist the order is, ref, alt, strand ....) + transcript_dict = {} + for tx in transcript_list: # Known UTA ID malforms if re.search('\/', tx[0]): continue - else: - transcript_list.append(tx) - + # don't exclude any tx, even if we have no chr like mapping + if tx[0] not in transcript_dict: + transcript_dict[tx[0]] = +1 + # but add to the sort number if we do have a main chr like mapping + if len(tx) > 2 and (tx[1].startswith('NC_00') or tx[1] in ['NC_012920.1', 'NC_001807.4']): + transcript_dict[tx[0]] = transcript_dict[tx[0]] +1 + + transcript_list = sorted(transcript_dict.keys(),key=lambda k:transcript_dict[k],reverse=True) # Create a variable to trap direct g_g liftover g_to_g_lift = {} # Create transcript level descriptions - for tx_alignment_data in transcript_list: - tx_id = tx_alignment_data[0] - + for tx_id in transcript_list: # Get transcript annotations try: annotation = vfo.db.get_transcript_annotation(tx_id) @@ -461,7 +465,6 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None if tx_id not in str(overlapping_tx): continue - hgvs_transcript_dict = formatter.hgvs_genomic2hgvs_transcript(g_hgvs, tx_id, self.vfo) # Gap checking @@ -615,6 +618,12 @@ def __init__(self, variant_description, genome_build, vfo, transcript_model=None elif order_my_tp['transcript_variant_error'] is not None and g_to_g_lift != {}: current_lift = g_to_g_lift + # first edit liftover to text, as required for output + for key, val in current_lift.items(): + for chr_type in val.keys(): + current_lift[key][chr_type]['hgvs_genomic_description'] = \ + current_lift[key][chr_type]['hgvs_genomic_description'].format( + {'max_ref_length': 0}) # Copy the liftover and split into primary and alt cp_current_lift = copy.deepcopy(current_lift) scaff_lift = copy.deepcopy(current_lift) @@ -686,6 +695,13 @@ def stucture_data(self): liftover_level=self.liftover ) + # First edit liftover to text, as required for output + for key, val in current_lift.items(): + for chr_type in val.keys(): + current_lift[key][chr_type]['hgvs_genomic_description'] = \ + current_lift[key][chr_type]['hgvs_genomic_description'].format( + {'max_ref_length': 0}) + # Copy the liftover and split into primary and alt cp_current_lift = copy.deepcopy(current_lift) scaff_lift = copy.deepcopy(current_lift) From 834453d3ac0ff14484ea22cab8a8a7db04e38eed Mon Sep 17 00:00:00 2001 From: Peter-J-Freeman Date: Tue, 6 Jan 2026 11:12:53 +0000 Subject: [PATCH 5/5] Update copyright statement for 2026 --- VariantFormatter/__init__.py | 2 +- VariantFormatter/formatter.py | 2 +- VariantFormatter/gapGenes.py | 2 +- VariantFormatter/simpleVariantFormatter.py | 2 +- VariantFormatter/variantformatter.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/VariantFormatter/__init__.py b/VariantFormatter/__init__.py index 722c745..7d04091 100644 --- a/VariantFormatter/__init__.py +++ b/VariantFormatter/__init__.py @@ -14,7 +14,7 @@ # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/formatter.py b/VariantFormatter/formatter.py index 9df413b..df4f7aa 100644 --- a/VariantFormatter/formatter.py +++ b/VariantFormatter/formatter.py @@ -413,7 +413,7 @@ def gap_checker(hgvs_transcript, hgvs_genomic, genome_build, vfo, transcript_mod # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/gapGenes.py b/VariantFormatter/gapGenes.py index 9d703cb..cf0da00 100644 --- a/VariantFormatter/gapGenes.py +++ b/VariantFormatter/gapGenes.py @@ -225,7 +225,7 @@ def fully_normalize(hgvs_tx, hgvs_genomic, hn, reverse_normalizer, vm, vfo): # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/simpleVariantFormatter.py b/VariantFormatter/simpleVariantFormatter.py index 4c613b4..388b04c 100644 --- a/VariantFormatter/simpleVariantFormatter.py +++ b/VariantFormatter/simpleVariantFormatter.py @@ -169,7 +169,7 @@ def format(batch_input, genome_build, transcript_model=None, specify_transcripts # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as diff --git a/VariantFormatter/variantformatter.py b/VariantFormatter/variantformatter.py index 2db6024..c27e31b 100644 --- a/VariantFormatter/variantformatter.py +++ b/VariantFormatter/variantformatter.py @@ -741,7 +741,7 @@ def collect_metadata(self): return meta # -# Copyright (C) 2016-2025 VariantValidator Contributors +# Copyright (C) 2016-2026 VariantValidator Contributors # # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as