From edfd89be6c7e2ba8395c3abbf6b74df9e30a29cf Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Wed, 29 Jan 2025 14:20:11 -0500 Subject: [PATCH 01/11] addjust script printing --- reform.py | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/reform.py b/reform.py index 949342f..7e5655b 100755 --- a/reform.py +++ b/reform.py @@ -27,7 +27,11 @@ def main(): ## Sequential processing for index in range(iterations): - ## Read the new fasta (to be inserted into the ref genome) + # Start interation + print("-------------------------------------------") + print(f"Begin modification from in{index+1}.fa") + print("-------------------------------------------") + ## Read the new fasta (to be inserted into the ref genome) try: record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] except IndexError: @@ -91,16 +95,17 @@ def main(): SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") - print("New fasta file created: ", new_fasta) - - print("Preparing to create new annotation file") + if (index == iterations-1): + print("------------------------------------------") + print("Reform Complete") + print("------------------------------------------") + print("New fasta file created: ", new_fasta) ## Read in new GFF features from in_gff in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) - print(in_arg.ref_gff) if index < iterations - 1: temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) temp_gff_name = temp_gff.name @@ -117,7 +122,9 @@ def main(): else: new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) prev_gff_path = new_gff_path - print("New {} file created: {} ".format(annotation_ext.upper(), prev_gff_path)) + if (index == iterations-1): + print(f"New fasta file created: {new_gff_path}") + print(f"Original Annotation: {in_arg.ref_gff}") def index_fasta(fasta_path): @@ -394,6 +401,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, # and ends after down_position elif gff_feat_start <= position and gff_feat_end > down_position: print("Feature split") + print(line) # Which side of the feature depends on the strand (we add this as a comment) (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") From ee82a45cb5542585fb8fcbcc3cd8d65848f821b5 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 3 Feb 2025 13:39:04 -0500 Subject: [PATCH 02/11] add just print statement --- reform.py | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/reform.py b/reform.py index 7e5655b..d164c5a 100755 --- a/reform.py +++ b/reform.py @@ -8,10 +8,10 @@ from Bio.SeqRecord import SeqRecord try: import pgzip as gzip_module - print("Using pgzip for gzip operations.") + print("\nUsing pgzip for gzip operations.\n") except ImportError: import gzip as gzip_module - print("pgzip not found, falling back to gzip.") + print("\npgzip not found, falling back to gzip.\n") def main(): @@ -33,14 +33,26 @@ def main(): print("-------------------------------------------") ## Read the new fasta (to be inserted into the ref genome) try: + filename_fa = in_arg.in_fasta[index] + if not os.path.exists(filename_fa): + raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") + real_path_fa = os.path.realpath(filename_fa) record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] except IndexError: - filename = in_arg.in_fasta[index].name - raise ValueError(f"Error: {filename} is not a valid FASTA file.") + raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: raise ValueError(f"Error parsing FASTA file: {str(e)}") + print("Preparing to create new FASTA file") + print(f"Original FASTA: {real_path_fa}") + filename_gff = in_arg.in_gff[index] + if not os.path.exists(filename_gff): + raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") + real_path_gff = os.path.realpath(filename_gff) + print("Preparing to create new annotation file") + print(f"Original Annotation: {real_path_gff}") + print() ### print new line - ## Generate index of sequences from ref reference fasta + ## Generate index of sequences from ref reference fasta if prev_fasta_path: chrom_seqs = index_fasta(prev_fasta_path) os.remove(prev_fasta_path) @@ -95,11 +107,6 @@ def main(): SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") - if (index == iterations-1): - print("------------------------------------------") - print("Reform Complete") - print("------------------------------------------") - print("New fasta file created: ", new_fasta) ## Read in new GFF features from in_gff in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) @@ -123,8 +130,11 @@ def main(): new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) prev_gff_path = new_gff_path if (index == iterations-1): - print(f"New fasta file created: {new_gff_path}") - print(f"Original Annotation: {in_arg.ref_gff}") + print("------------------------------------------") + print("Reform Complete") + print("------------------------------------------") + print(f"New .fa file created: {os.path.realpath(new_fasta)}") + print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") def index_fasta(fasta_path): From 58058ceb4a9a1ce3a3b4af8d1c8ac95377ae9bc8 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Wed, 5 Feb 2025 13:21:57 -0500 Subject: [PATCH 03/11] split modify process --- reform.py | 219 +++++++++++++++++++++++++++++------------------------- 1 file changed, 116 insertions(+), 103 deletions(-) diff --git a/reform.py b/reform.py index d164c5a..0d03024 100755 --- a/reform.py +++ b/reform.py @@ -31,111 +31,118 @@ def main(): print("-------------------------------------------") print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") - ## Read the new fasta (to be inserted into the ref genome) - try: - filename_fa = in_arg.in_fasta[index] - if not os.path.exists(filename_fa): - raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") - real_path_fa = os.path.realpath(filename_fa) - record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] - except IndexError: - raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") - except Exception as e: - raise ValueError(f"Error parsing FASTA file: {str(e)}") - print("Preparing to create new FASTA file") - print(f"Original FASTA: {real_path_fa}") - filename_gff = in_arg.in_gff[index] - if not os.path.exists(filename_gff): - raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") - real_path_gff = os.path.realpath(filename_gff) - print("Preparing to create new annotation file") - print(f"Original Annotation: {real_path_gff}") - print() ### print new line + if hasattr(in_arg, 'chrom'): + # Modify existing chrom seq + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ + = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ + iterations, prev_gff_path) - ## Generate index of sequences from ref reference fasta - if prev_fasta_path: - chrom_seqs = index_fasta(prev_fasta_path) - os.remove(prev_fasta_path) - else: - chrom_seqs = index_fasta(in_arg.ref_fasta) - - ## Obtain the sequence of the chromosome we want to modify - seq = chrom_seqs[in_arg.chrom] - seq_str = str(seq.seq) - - ## Get the position to insert the new sequence - positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) - position = positions['position'] - down_position = positions['down_position'] - - ## Save current modification which include position(index) and length changed. - if position == down_position: - length_changed = len(str(record.seq)) - else: - length_changed = len(str(record.seq)) - (down_position - position - 1) - prev_modifications.append((position,length_changed)) - - if position != down_position: - print("Removing nucleotides from position {} - {}".format(position, down_position)) - print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" - .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) - - ## Build the new chromosome sequence with the inserted_seq - ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) - new_length = str(len(new_seq)) - new_record = SeqRecord( - Seq(new_seq), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) + print("------------------------------------------") + print("Reform Complete") + print("------------------------------------------") + print(f"New .fa file created: {os.path.realpath(new_fasta)}") + print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") + +def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path): + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + read_gff(in_arg, index) + ## Obtain the sequence of the chromosome we want to modify + seq = chrom_seqs[in_arg.chrom] + seq_str = str(seq.seq) + ## Get the position to insert the new sequence + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + position = positions['position'] + down_position = positions['down_position'] + ## Save current modification which include position(index) and length changed. + if position == down_position: + length_changed = len(str(record.seq)) + else: + length_changed = len(str(record.seq)) - (down_position - position - 1) + prev_modifications.append((position,length_changed)) + if position != down_position: + print("Removing nucleotides from position {} - {}".format(position, down_position)) + print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" + .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + ## Build the new chromosome sequence with the inserted_seq + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] + chrom_length = str(len(seq_str)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq), + id=seq.id, + description=seq.description.replace(chrom_length, new_length) ) - - ## Create new fasta file with modified chromosome - if index < iterations - 1: - new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') - new_fasta_file.close() - new_fasta = new_fasta_file.name - prev_fasta_path = new_fasta - else: - ref_basename, _ = get_ref_basename(in_arg.ref_fasta) - ref_name = ref_basename - new_fasta = ref_name + '_reformed.fa' - with open(new_fasta, "w") as f: - for s in chrom_seqs: - if s == seq.id: - SeqIO.write([new_record], f, "fasta") - else: - SeqIO.write([chrom_seqs[s]], f, "fasta") - - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) - - ## Create a temp file for gff, if index is not equal to last iteration - annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) - if index < iterations - 1: - temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) - temp_gff_name = temp_gff.name - temp_gff.close() - if prev_gff_path: - new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - os.remove(prev_gff_path) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + with open(new_fasta, "w") as f: + for s in chrom_seqs: + if s == seq.id: + SeqIO.write([new_record], f, "fasta") else: - new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + SeqIO.write([chrom_seqs[s]], f, "fasta") + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + os.remove(prev_gff_path) else: - new_gff_name = annotation_name + '_reformed' + annotation_ext - if prev_gff_path: - new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - else: - new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - prev_gff_path = new_gff_path - if (index == iterations-1): - print("------------------------------------------") - print("Reform Complete") - print("------------------------------------------") - print(f"New .fa file created: {os.path.realpath(new_fasta)}") - print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, +def read_fasta(in_arg, index, prev_fasta_path): + ## Read the new fasta (to be inserted into the ref genome) + try: + filename_fa = in_arg.in_fasta[index] + if not os.path.exists(filename_fa): + raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") + real_path_fa = os.path.realpath(filename_fa) + record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + except IndexError: + raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") + except Exception as e: + raise ValueError(f"Error parsing FASTA file: {str(e)}") + print("Preparing to create new FASTA file") + print(f"Original FASTA: {real_path_fa}") + ## Generate index of sequences from ref reference fasta + if prev_fasta_path: + chrom_seqs = index_fasta(prev_fasta_path) + os.remove(prev_fasta_path) + else: + chrom_seqs = index_fasta(in_arg.ref_fasta) + return record, chrom_seqs + +def read_gff(in_arg, index): + filename_gff = in_arg.in_gff[index] + if not os.path.exists(filename_gff): + raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") + real_path_gff = os.path.realpath(filename_gff) + print("Preparing to create new annotation file") + print(f"Original Annotation: {real_path_gff}") + print() ### print new line def index_fasta(fasta_path): ''' @@ -562,9 +569,11 @@ def rename_id(line): def get_input_args(): parser = argparse.ArgumentParser() - - parser.add_argument('--chrom', type = str, required = True, - help = "Chromosome name (String)") + chrom_group = parser.add_mutually_exclusive_group(required=True) + chrom_group.add_argument('--chrom', type=str, + help="Chromosome name (String)") + chrom_group.add_argument('--new_chrom', type=str, + help="New Chromosome name (String)") parser.add_argument('--in_fasta', type=str, required=True, help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") parser.add_argument('--in_gff', type=str, required=True, @@ -610,6 +619,10 @@ def get_input_args(): print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") exit() + if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta): + parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") + exit() + return in_args, iterations if __name__ == "__main__": From f1aecaa4f31211055f0137dadae03f5e28964ce9 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 7 Feb 2025 21:47:44 -0500 Subject: [PATCH 04/11] add new chrom --- reform.py | 193 +++++++++++++++++++++++++++++++++------ temp.py | 208 ++++++++++++++++++++++++++++++++++++++++++ test_data/17/gold.fa | 4 + test_data/17/gold.gtf | 8 ++ test_data/17/in.fa | 2 + test_data/17/in.gtf | 4 + test_data/17/ref.fa | 2 + test_data/17/ref.gtf | 4 + test_reform.py | 39 ++++++++ 9 files changed, 434 insertions(+), 30 deletions(-) create mode 100644 temp.py create mode 100644 test_data/17/gold.fa create mode 100644 test_data/17/gold.gtf create mode 100644 test_data/17/in.fa create mode 100644 test_data/17/in.gtf create mode 100644 test_data/17/ref.fa create mode 100644 test_data/17/ref.gtf diff --git a/reform.py b/reform.py index 0d03024..fb59319 100755 --- a/reform.py +++ b/reform.py @@ -31,11 +31,15 @@ def main(): print("-------------------------------------------") print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") - if hasattr(in_arg, 'chrom'): + if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: # Modify existing chrom seq new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ iterations, prev_gff_path) + else: + # Add new chrom + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) print("------------------------------------------") print("Reform Complete") @@ -44,7 +48,11 @@ def main(): print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path): - ## Read FASTA + """ + Modifies a specified and existing chromosome sequence by inserting/replacing a new sequence at a given + position and updates the corresponding FASTA and GFF files. + """ + ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) read_gff(in_arg, index) @@ -111,9 +119,69 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) prev_gff_path = new_gff_path - return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path + +def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations): + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + read_gff(in_arg, index) + ## Check if new chromosome existed in sequence + if in_arg.new_chrom in chrom_seqs: + raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.") + + ## Build the new chromosome sequence by append new sequence below + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = str(record.seq) + new_length = str(len(new_seq)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq), + id=in_arg.new_chrom, # Use new_chrom + description=f"{in_arg.new_chrom}" + ) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + ## Write all the original chromosomes first, then new chromosomes. + with open(new_fasta, "w") as f: + for s in chrom_seqs: + SeqIO.write([chrom_seqs[s]], f, "fasta") + SeqIO.write([new_record], f, "fasta") + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + else: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path def read_fasta(in_arg, index, prev_fasta_path): + """ + Reads the FASTA file, extracts the sequence to be inserted, + and indexes the reference genome chromosome sequences. + """ ## Read the new fasta (to be inserted into the ref genome) try: filename_fa = in_arg.in_fasta[index] @@ -136,6 +204,9 @@ def read_fasta(in_arg, index, prev_fasta_path): return record, chrom_seqs def read_gff(in_arg, index): + """ + Reads the GFF file, verifies its existence, and prints its file path information. + """ filename_gff = in_arg.in_gff[index] if not os.path.exists(filename_gff): raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") @@ -535,6 +606,63 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, os.remove(ref_gff_path) return new_gff_name +def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id): + """ + Appends new annotations to an existing GFF file without modifying existing features. + """ + ref_gff_path = ref_gff + ## Handle compressed .gz GFF files + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + for line in f: + tmp_f.write(line) + tmp_f.flush() + ref_gff_path = tmp_f.name + ## Open new GFF file for writing + with open(new_gff_name, "w") as gff_out: + ## Copy all existing annotations to new GFF file + with open(ref_gff_path, "r", encoding="utf-8") as f: + for line in f: + gff_out.write(line) + ## Append new annotations if present + if in_gff_lines: + print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") + for new_annotation in in_gff_lines: + if new_annotation: + gff_out.write("\t".join(new_annotation)) + + ## Cleanup temp file if needed + if ref_gff.endswith('.gz') and ref_gff_path != ref_gff: + try: + os.remove(ref_gff_path) + except Exception as e: + print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}") + + # with open(new_gff_name, "w") as gff_out: + # ref_gff_path = ref_gff + # if ref_gff.endswith('.gz'): + # with gzip_module.open(ref_gff, 'rt') as f: + # with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + # for line in f: + # tmp_f.write(line) + # tmp_f.flush() + # ref_gff_path = tmp_f.name + # ## Copy all existing annotations to new GFF file + # with open(ref_gff_path, "r") as f: + # for line in f: + # gff_out.write(line) + # # Append new annotations + # print(f"Appending new annotations to chromosome {chrom_id}.") + # for new_annotation in in_gff_lines: + # if new_annotation: + # gff_out.write("\t".join(new_annotation) + "\n") + + # # Cleanup temp file if needed + # if ref_gff.endswith('.gz'): + # os.remove(ref_gff_path) + return new_gff_name + def format_comment(comment, ext): ''' Format comment according to ext (GFF or GTF) and return @@ -592,36 +720,41 @@ def get_input_args(): in_args = parser.parse_args() in_args.in_fasta = in_args.in_fasta.split(',') in_args.in_gff = in_args.in_gff.split(',') - if in_args.upstream_fasta: - in_args.upstream_fasta = in_args.upstream_fasta.split(',') - if in_args.downstream_fasta: - in_args.downstream_fasta = in_args.downstream_fasta.split(',') - - if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None): - print("** Error: You must provide either the position, or the upstream and downstream sequences.") - exit() - - if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta): - print("** Error: The number of upstream_fasta and downstream_fasta files does not match.") + if (len(in_args.in_fasta) != len(in_args.in_gff)): + print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") exit() - - if in_args.position is not None: - try: - in_args.position = list(map(int, in_args.position.split(','))) - iterations = iterations = len(in_args.position) - except ValueError: - print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.") + else: + iterations = len(in_args.in_fasta) + ## Modify existing chrom + if in_args.chrom: + if in_args.upstream_fasta: + in_args.upstream_fasta = in_args.upstream_fasta.split(',') + if in_args.downstream_fasta: + in_args.downstream_fasta = in_args.downstream_fasta.split(',') + if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None): + print("** Error: You must provide either the position, or the upstream and downstream sequences.") + exit() + if in_args.position is not None: + try: + in_args.position = list(map(int, in_args.position.split(','))) + except ValueError: + print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.") + exit() + if iterations != len(in_args.position): + print("** Error: Position must be a equal to number of input FASTA") + exit() + else: + if iterations != len(in_args.upstream_fasta): + print("** Error: Upstream FASTA must be a equal to number of input FASTA") + exit() + if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta): + print("** Error: The number of upstream_fasta and downstream_fasta files does not match.") exit() + ## Add new chrom else: - iterations = len(in_args.upstream_fasta) - - if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations): - print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") - exit() - - if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta): - parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") - exit() + if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta: + parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") + exit() return in_args, iterations diff --git a/temp.py b/temp.py new file mode 100644 index 0000000..077400a --- /dev/null +++ b/temp.py @@ -0,0 +1,208 @@ +def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): + ''' + Goes line by line through a gff file to remove existing features + (or parts of existing features) and/or insert new features. + In the process of adding new features, existing features + may be cut-off on either end or split into two. + This attempts to handle all cases. + new_gff_name: the name of the new gff file to create + ref_gff: the reference gff file to modify + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + position: start position of removal of existing sequence + down_position: end position of removal of existing sequence + chrom_id: the ID of the chromosome to modify + new_seq_length: the length of the new sequence being added to the chromosome + ''' + with open(new_gff_name, "w") as gff_out: + in_gff_lines_appended = False + split_features = [] + last_seen_chrom_id = None + gff_ext = new_gff_name.split('.')[-1] + ref_gff_path = ref_gff + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + ## Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + ref_gff_path = tmp_f.name + with open(ref_gff_path, "r") as f: + for line in f: + line_elements = line.split('\t') + if line.startswith("#"): + if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + ## Edit the length of the chromosome + original_length = int(line_elements[3]) + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + else: + gff_chrom_id = line_elements[0] + gff_feat_start = int(line_elements[3]) + gff_feat_end = int(line_elements[4]) + gff_feat_type = line_elements[2] + gff_feat_strand = line_elements[6] + gff_comments = line_elements[8].strip() + + # If we've seen at least one chromosome + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) + # and now we're on a new chromosome in the original gff file + # and the in_gff_lines have not yet been appended: + # we assume they need to be appended to the end of the chromosome + # so append them before proceeding + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and gff_chrom_id != last_seen_chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + last_seen_chrom_id = gff_chrom_id + + # If this is not the chromosome of interest + # Or if the current feature ends before any + # modification (which occurs at position) + # Then simply write the feature as is (no modification) + # (remember gff co-ordinates are 1 based and position + # is 0 based, therefor check less than *or equal to*) + if gff_chrom_id != chrom_id or gff_feat_end <= position: + gff_out.write(line) + + # Modify chromosome feature length (if feature is + # "chromosome" or "region") + elif gff_feat_type in ['chromosome', 'region']: + original_length = gff_feat_end + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + + # Split feature into 2 if feature starts before position + # and ends after down_position + elif gff_feat_start <= position and gff_feat_end > down_position: + print("Feature split") + print(line) + # Which side of the feature depends on the strand (we add this as a comment) + (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") + + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(x), + gff_ext + ) + # Modified feature ends at 'position' + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # The downstream flank(s) will be added immediately after the + # in_gff (new) features are written. + # First, attempt to rename IDs (to indicate split) + renamed_id_attributes = rename_id(line) + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(y), + gff_ext + ) + split_features.append( + ( + line_elements, + position + new_seq_length + 1, + gff_feat_end + new_seq_length - (down_position - position), + renamed_id_attributes + new_comment + ) + ) + + # Change end position of feature to cut off point (position) if the + # feature ends within the deletion (between position & down_position) + elif gff_feat_start <= position and gff_feat_end <= down_position: + # Which side of the feature depends on the strand (we add this as a comment) + x = "3" if gff_feat_strand == "+" else "5" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Skip this feature if it falls entirely within the deletion + elif gff_feat_start > position and gff_feat_end <= down_position: + print("Skip feature (this feature was removed from sequence)") + continue + + else: + if not in_gff_lines_appended: + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Change start position of feature to after cutoff point if + # the feature starts within the deletion + if (gff_feat_start > position + and gff_feat_start <= down_position + and gff_feat_end > down_position): + x = "5" if gff_feat_strand == "+" else "3" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + start = position + new_seq_length + 1, + end = gff_feat_end + new_seq_length - (down_position - position), + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Offset all downstream feature positions by offset length + elif gff_feat_start > down_position: + offset_length = new_seq_length - (down_position - position) + modified_line = modify_gff_line( + line_elements, + start = gff_feat_start + offset_length, + end = gff_feat_end + offset_length + ) + gff_out.write(modified_line) + + else: + print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + exit() + + # If we've iterated over the entire original gff + # (i.e. out of the for loop which iterates over it) + # and still haven't written in_gff_lines + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): + # we assume they need to be appended to the end of the genome + # (i.e. end of the last chromosome) + # so append them now + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Checking to ensure in_gff_lines written + if not in_gff_lines_appended: + print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") + exit() + # Remove temp gtf file + if ref_gff.endswith('.gz'): + os.remove(ref_gff_path) + return new_gff_name + + +""" +python3 ../../reform.py \ + --new_chrom="Y" \ + --in_fasta=in.fa \ + --in_gff=in.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf +""" \ No newline at end of file diff --git a/test_data/17/gold.fa b/test_data/17/gold.fa new file mode 100644 index 0000000..79a60e4 --- /dev/null +++ b/test_data/17/gold.fa @@ -0,0 +1,4 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/17/gold.gtf b/test_data/17/gold.gtf new file mode 100644 index 0000000..83f530e --- /dev/null +++ b/test_data/17/gold.gtf @@ -0,0 +1,8 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/17/in.fa b/test_data/17/in.fa new file mode 100644 index 0000000..2b55705 --- /dev/null +++ b/test_data/17/in.fa @@ -0,0 +1,2 @@ +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/17/in.gtf b/test_data/17/in.gtf new file mode 100644 index 0000000..7dc556d --- /dev/null +++ b/test_data/17/in.gtf @@ -0,0 +1,4 @@ +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/17/ref.fa b/test_data/17/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/17/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/17/ref.gtf b/test_data/17/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/17/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index a10d75e..61054b6 100644 --- a/test_reform.py +++ b/test_reform.py @@ -684,5 +684,44 @@ def test_case_16(self): os.chdir(wd) + def test_case_17(self): + """ + Case 17: + Testing Reform which adding new chrom + """ + + wd = os.getcwd() + os.chdir('test_data/17/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y" \ + --in_fasta=in.fa \ + --in_gff=in.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf \ + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + if __name__ == '__main__': unittest.main() From 05a6e15be09e0be00e7aa40d60f54549312a7305 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 7 Feb 2025 22:03:34 -0500 Subject: [PATCH 05/11] remove redundancy comments and temp file --- reform.py | 35 ++------- temp.py | 208 ------------------------------------------------------ 2 files changed, 7 insertions(+), 236 deletions(-) delete mode 100644 temp.py diff --git a/reform.py b/reform.py index fb59319..dd0ed84 100755 --- a/reform.py +++ b/reform.py @@ -32,12 +32,12 @@ def main(): print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: - # Modify existing chrom seq - new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ - = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ - iterations, prev_gff_path) + ## Modify existing chrom seq + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ + iterations, prev_gff_path) else: - # Add new chrom + ## Add new chrom new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) @@ -118,7 +118,9 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) else: new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations): @@ -246,7 +248,6 @@ def get_ref_basename(filepath): name, ext = os.path.splitext(base) return name, ext - def modify_gff_line(elements, start=None, end=None, comment=None): ''' Modifies an existing GFF line and returns the modified line. Currently, you can @@ -639,28 +640,6 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i except Exception as e: print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}") - # with open(new_gff_name, "w") as gff_out: - # ref_gff_path = ref_gff - # if ref_gff.endswith('.gz'): - # with gzip_module.open(ref_gff, 'rt') as f: - # with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: - # for line in f: - # tmp_f.write(line) - # tmp_f.flush() - # ref_gff_path = tmp_f.name - # ## Copy all existing annotations to new GFF file - # with open(ref_gff_path, "r") as f: - # for line in f: - # gff_out.write(line) - # # Append new annotations - # print(f"Appending new annotations to chromosome {chrom_id}.") - # for new_annotation in in_gff_lines: - # if new_annotation: - # gff_out.write("\t".join(new_annotation) + "\n") - - # # Cleanup temp file if needed - # if ref_gff.endswith('.gz'): - # os.remove(ref_gff_path) return new_gff_name def format_comment(comment, ext): diff --git a/temp.py b/temp.py deleted file mode 100644 index 077400a..0000000 --- a/temp.py +++ /dev/null @@ -1,208 +0,0 @@ -def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): - ''' - Goes line by line through a gff file to remove existing features - (or parts of existing features) and/or insert new features. - In the process of adding new features, existing features - may be cut-off on either end or split into two. - This attempts to handle all cases. - new_gff_name: the name of the new gff file to create - ref_gff: the reference gff file to modify - in_gff_lines: a list of lists where each nested list is a list of - columns (in gff format) associated with each new feature to insert - position: start position of removal of existing sequence - down_position: end position of removal of existing sequence - chrom_id: the ID of the chromosome to modify - new_seq_length: the length of the new sequence being added to the chromosome - ''' - with open(new_gff_name, "w") as gff_out: - in_gff_lines_appended = False - split_features = [] - last_seen_chrom_id = None - gff_ext = new_gff_name.split('.')[-1] - ref_gff_path = ref_gff - if ref_gff.endswith('.gz'): - with gzip_module.open(ref_gff, 'rt') as f: - ## Create a tempfile to store uncompressde content - with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: - tmp_f.write(f.read()) - ref_gff_path = tmp_f.name - with open(ref_gff_path, "r") as f: - for line in f: - line_elements = line.split('\t') - if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: - ## Edit the length of the chromosome - original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - else: - gff_chrom_id = line_elements[0] - gff_feat_start = int(line_elements[3]) - gff_feat_end = int(line_elements[4]) - gff_feat_type = line_elements[2] - gff_feat_strand = line_elements[6] - gff_comments = line_elements[8].strip() - - # If we've seen at least one chromosome - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) - # and now we're on a new chromosome in the original gff file - # and the in_gff_lines have not yet been appended: - # we assume they need to be appended to the end of the chromosome - # so append them before proceeding - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and gff_chrom_id != last_seen_chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - last_seen_chrom_id = gff_chrom_id - - # If this is not the chromosome of interest - # Or if the current feature ends before any - # modification (which occurs at position) - # Then simply write the feature as is (no modification) - # (remember gff co-ordinates are 1 based and position - # is 0 based, therefor check less than *or equal to*) - if gff_chrom_id != chrom_id or gff_feat_end <= position: - gff_out.write(line) - - # Modify chromosome feature length (if feature is - # "chromosome" or "region") - elif gff_feat_type in ['chromosome', 'region']: - original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - - # Split feature into 2 if feature starts before position - # and ends after down_position - elif gff_feat_start <= position and gff_feat_end > down_position: - print("Feature split") - print(line) - # Which side of the feature depends on the strand (we add this as a comment) - (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") - - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(x), - gff_ext - ) - # Modified feature ends at 'position' - modified_line = modify_gff_line( - line_elements, - end = position, - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # The downstream flank(s) will be added immediately after the - # in_gff (new) features are written. - # First, attempt to rename IDs (to indicate split) - renamed_id_attributes = rename_id(line) - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(y), - gff_ext - ) - split_features.append( - ( - line_elements, - position + new_seq_length + 1, - gff_feat_end + new_seq_length - (down_position - position), - renamed_id_attributes + new_comment - ) - ) - - # Change end position of feature to cut off point (position) if the - # feature ends within the deletion (between position & down_position) - elif gff_feat_start <= position and gff_feat_end <= down_position: - # Which side of the feature depends on the strand (we add this as a comment) - x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - end = position, - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Skip this feature if it falls entirely within the deletion - elif gff_feat_start > position and gff_feat_end <= down_position: - print("Skip feature (this feature was removed from sequence)") - continue - - else: - if not in_gff_lines_appended: - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - # Change start position of feature to after cutoff point if - # the feature starts within the deletion - if (gff_feat_start > position - and gff_feat_start <= down_position - and gff_feat_end > down_position): - x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - start = position + new_seq_length + 1, - end = gff_feat_end + new_seq_length - (down_position - position), - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Offset all downstream feature positions by offset length - elif gff_feat_start > down_position: - offset_length = new_seq_length - (down_position - position) - modified_line = modify_gff_line( - line_elements, - start = gff_feat_start + offset_length, - end = gff_feat_end + offset_length - ) - gff_out.write(modified_line) - - else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) - exit() - - # If we've iterated over the entire original gff - # (i.e. out of the for loop which iterates over it) - # and still haven't written in_gff_lines - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): - # we assume they need to be appended to the end of the genome - # (i.e. end of the last chromosome) - # so append them now - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - # Checking to ensure in_gff_lines written - if not in_gff_lines_appended: - print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") - exit() - # Remove temp gtf file - if ref_gff.endswith('.gz'): - os.remove(ref_gff_path) - return new_gff_name - - -""" -python3 ../../reform.py \ - --new_chrom="Y" \ - --in_fasta=in.fa \ - --in_gff=in.gtf \ - --ref_fasta=ref.fa \ - --ref_gff=ref.gtf -""" \ No newline at end of file From b2c1bb446c143eac14555f07678cc4795c801304 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 3 Mar 2025 15:40:51 -0500 Subject: [PATCH 06/11] Fix bugs --- reform.py | 56 +++++++++++++++++++++---------------------- test_data/18/gold.fa | 8 +++++++ test_data/18/gold.gtf | 16 +++++++++++++ test_data/18/in1.fa | 2 ++ test_data/18/in1.gtf | 4 ++++ test_data/18/in2.fa | 2 ++ test_data/18/in2.gtf | 4 ++++ test_data/18/in3.fa | 2 ++ test_data/18/in3.gtf | 4 ++++ test_data/18/ref.fa | 2 ++ test_data/18/ref.gtf | 4 ++++ test_reform.py | 41 ++++++++++++++++++++++++++++++- 12 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 test_data/18/gold.fa create mode 100644 test_data/18/gold.gtf create mode 100644 test_data/18/in1.fa create mode 100644 test_data/18/in1.gtf create mode 100644 test_data/18/in2.fa create mode 100644 test_data/18/in2.gtf create mode 100644 test_data/18/in3.fa create mode 100644 test_data/18/in3.gtf create mode 100644 test_data/18/ref.fa create mode 100644 test_data/18/ref.gtf diff --git a/reform.py b/reform.py index dd0ed84..eb918fc 100755 --- a/reform.py +++ b/reform.py @@ -55,12 +55,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) - read_gff(in_arg, index) + check_gff(in_arg, index) ## Obtain the sequence of the chromosome we want to modify - seq = chrom_seqs[in_arg.chrom] - seq_str = str(seq.seq) + existing_seq = chrom_seqs[in_arg.chrom] + existing_seq_str = str(existing_seq.seq) + ## Get the position to insert the new sequence - positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, existing_seq_str, prev_modifications) position = positions['position'] down_position = positions['down_position'] ## Save current modification which include position(index) and length changed. @@ -75,13 +76,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) ## Build the new chromosome sequence with the inserted_seq ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) + new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:] + chrom_length = str(len(existing_seq_str)) new_length = str(len(new_seq)) new_record = SeqRecord( Seq(new_seq), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) + id=existing_seq.id, + description=existing_seq.description.replace(chrom_length, new_length) ) ## Create new fasta file with modified chromosome if index < iterations - 1: @@ -95,7 +96,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications new_fasta = ref_name + '_reformed.fa' with open(new_fasta, "w") as f: for s in chrom_seqs: - if s == seq.id: + if s == existing_seq.id: SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") @@ -108,16 +109,16 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications temp_gff_name = temp_gff.name temp_gff.close() if prev_gff_path: - new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) os.remove(prev_gff_path) else: - new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) else: new_gff_name = annotation_name + '_reformed' + annotation_ext if prev_gff_path: - new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) else: - new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) prev_gff_path = new_gff_path @@ -127,20 +128,17 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) - read_gff(in_arg, index) + check_gff(in_arg, index) ## Check if new chromosome existed in sequence - if in_arg.new_chrom in chrom_seqs: - raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.") - + if in_arg.new_chrom[index] in chrom_seqs: + raise ValueError(f"Chromosome {in_arg.new_chrom[index]} already exists in the FASTA file.") ## Build the new chromosome sequence by append new sequence below - ## If the chromosome sequence length is in the header, replace it with new length + ## Using new_chrom as the id of the new chromosome new_seq = str(record.seq) - new_length = str(len(new_seq)) - new_length = str(len(new_seq)) new_record = SeqRecord( Seq(new_seq), - id=in_arg.new_chrom, # Use new_chrom - description=f"{in_arg.new_chrom}" + id=in_arg.new_chrom[index], # Use new_chrom + description=f"{in_arg.new_chrom[index]}" ) ## Create new fasta file with modified chromosome if index < iterations - 1: @@ -166,16 +164,16 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) temp_gff_name = temp_gff.name temp_gff.close() if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) os.remove(prev_gff_path) else: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) else: new_gff_name = annotation_name + '_reformed' + annotation_ext if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) else: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) prev_gff_path = new_gff_path return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path @@ -205,7 +203,7 @@ def read_fasta(in_arg, index, prev_fasta_path): chrom_seqs = index_fasta(in_arg.ref_fasta) return record, chrom_seqs -def read_gff(in_arg, index): +def check_gff(in_arg, index): """ Reads the GFF file, verifies its existence, and prints its file path information. """ @@ -680,7 +678,7 @@ def get_input_args(): chrom_group.add_argument('--chrom', type=str, help="Chromosome name (String)") chrom_group.add_argument('--new_chrom', type=str, - help="New Chromosome name (String)") + help="Comma-separated new chromosome name (String)") parser.add_argument('--in_fasta', type=str, required=True, help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") parser.add_argument('--in_gff', type=str, required=True, @@ -734,6 +732,8 @@ def get_input_args(): if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta: parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") exit() + ## Convert new_chrom from string to list + in_args.new_chrom = in_args.new_chrom.split(',') return in_args, iterations diff --git a/test_data/18/gold.fa b/test_data/18/gold.fa new file mode 100644 index 0000000..0194856 --- /dev/null +++ b/test_data/18/gold.fa @@ -0,0 +1,8 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC +>H +GGGGAATTCCCCGGGG +>M +CCCCGGGGAAAATTTT diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf new file mode 100644 index 0000000..7d876ea --- /dev/null +++ b/test_data/18/gold.gtf @@ -0,0 +1,16 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/18/in1.fa b/test_data/18/in1.fa new file mode 100644 index 0000000..2b55705 --- /dev/null +++ b/test_data/18/in1.fa @@ -0,0 +1,2 @@ +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf new file mode 100644 index 0000000..7dc556d --- /dev/null +++ b/test_data/18/in1.gtf @@ -0,0 +1,4 @@ +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in2.fa b/test_data/18/in2.fa new file mode 100644 index 0000000..0b5cf0c --- /dev/null +++ b/test_data/18/in2.fa @@ -0,0 +1,2 @@ +>H +GGGGAATTCCCCGGGG diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf new file mode 100644 index 0000000..f4e79ad --- /dev/null +++ b/test_data/18/in2.gtf @@ -0,0 +1,4 @@ +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/18/in3.fa b/test_data/18/in3.fa new file mode 100644 index 0000000..38306a6 --- /dev/null +++ b/test_data/18/in3.fa @@ -0,0 +1,2 @@ +>M +CCCCGGGGAAAATTTT diff --git a/test_data/18/in3.gtf b/test_data/18/in3.gtf new file mode 100644 index 0000000..ee9257c --- /dev/null +++ b/test_data/18/in3.gtf @@ -0,0 +1,4 @@ +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/18/ref.fa b/test_data/18/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/18/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/18/ref.gtf b/test_data/18/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/18/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index 61054b6..0d421c2 100644 --- a/test_reform.py +++ b/test_reform.py @@ -699,7 +699,46 @@ def test_case_17(self): --in_fasta=in.fa \ --in_gff=in.gtf \ --ref_fasta=ref.fa \ - --ref_gff=ref.gtf \ + --ref_gff=ref.gtf + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + + def test_case_18(self): + """ + Case 18: + Testing Reform which adding multiple new chroms + """ + + wd = os.getcwd() + os.chdir('test_data/18/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y,H,M" \ + --in_fasta=in1.fa,in2.fa,in3.fa \ + --in_gff=in1.gtf,in2.gtf,in3.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf """ response = subprocess.getoutput(command) From bf158ee990ba5ac605cc728f499ae1d8f3ab5cb5 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 14 Mar 2025 14:25:12 -0400 Subject: [PATCH 07/11] Read meta-data from input gtf/gff --- reform.py | 63 ++++++++++++++++++++++++++++++++----------- test_data/18/gold.gtf | 1 + test_data/18/in1.gtf | 1 + 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/reform.py b/reform.py index eb918fc..c978a7a 100755 --- a/reform.py +++ b/reform.py @@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: SeqIO.write([chrom_seqs[s]], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -156,7 +156,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([chrom_seqs[s]], f, "fasta") SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -269,7 +269,7 @@ def modify_gff_line(elements, start=None, end=None, comment=None): ## Return the modified line return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) -def get_in_gff_lines(in_gff): +def get_in_gff_lines(in_gff, is_add_new_chrom): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -278,16 +278,38 @@ def get_in_gff_lines(in_gff): with open(in_gff, "r") as f: in_gff_lines = [] for line in f: - line_elements = line.split('\t') - # Skip comment lines - if line.startswith("#"): + # Skip empty lines + if not line.strip(): continue - # Ensure lines have 9 columns - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) - exit() - in_gff_lines.append(line_elements) + + # Handle differently based on whether we're adding a new chromosome or modifying existing + if not is_add_new_chrom: + # Skip comment lines when modifying existing chromosome + if line.startswith("#"): + continue + # Parse with tab delimiter for regular lines + line_elements = line.split('\t') + # Ensure lines have 9 columns when modifying existing chromosome + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + exit() + in_gff_lines.append(line_elements) + else: + # For adding new chromosomes, process all header and feature lines + if line.startswith("#"): + # Select appropriate delimiter based on content + if '\t' in line: + line_elements = line.split('\t') + else: + line_elements = line.split() + if line_elements[0] == "##sequence-region": + line_elements[-1] += '\n' + in_gff_lines.append(line_elements) + elif len(line.split('\t')) == 9: + # Normal feature line with 9 tab-delimited columns + in_gff_lines.append(line.split('\t')) + print(in_gff_lines) return in_gff_lines def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): @@ -436,15 +458,22 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, ref_gff_path = tmp_f.name with open(ref_gff_path, "r") as f: for line in f: - line_elements = line.split('\t') + # For header lines, handle both tab and space-delimited formats if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + if '\t' in line: + line_elements = line.split('\t') + else: + line_elements = line.split() + + if line.startswith("##sequence-region") and line_elements[1] == chrom_id: ## Edit the length of the chromosome original_length = int(line_elements[3]) new_length = original_length - (down_position - position) + new_seq_length line = line.replace(str(original_length), str(new_length)) gff_out.write(line) else: + # Regular feature lines are always tab-delimited + line_elements = line.split('\t') gff_chrom_id = line_elements[0] gff_feat_start = int(line_elements[3]) gff_feat_end = int(line_elements[4]) @@ -628,7 +657,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i if in_gff_lines: print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") for new_annotation in in_gff_lines: - if new_annotation: + if new_annotation[0] == "##sequence-region": + gff_out.write(" ".join(new_annotation)) + elif new_annotation: gff_out.write("\t".join(new_annotation)) ## Cleanup temp file if needed @@ -739,4 +770,4 @@ def get_input_args(): if __name__ == "__main__": main() - + diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf index 7d876ea..de2f092 100644 --- a/test_data/18/gold.gtf +++ b/test_data/18/gold.gtf @@ -2,6 +2,7 @@ X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +##sequence-region NC_000001.11 1 248956422 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf index 7dc556d..86a6f67 100644 --- a/test_data/18/in1.gtf +++ b/test_data/18/in1.gtf @@ -1,3 +1,4 @@ +##sequence-region NC_000001.11 1 248956422 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; From d7047cd3e3d78e9293c23542a6f43202b8402d6a Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 14 Apr 2025 17:49:23 -0400 Subject: [PATCH 08/11] feature improvement --- reform.py | 93 ++++++++++++++++++++++++++++--------------- test_data/18/gold.gtf | 3 +- test_data/18/in1.gtf | 2 +- test_data/18/in2.gtf | 1 + 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/reform.py b/reform.py index c978a7a..3661bb0 100755 --- a/reform.py +++ b/reform.py @@ -100,8 +100,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False) + ## Read in new GFF features from in_gff, False means modify existing chrom + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -156,7 +156,8 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([chrom_seqs[s]], f, "fasta") SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True) + ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq)) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -268,8 +269,27 @@ def modify_gff_line(elements, start=None, end=None, comment=None): ## Return the modified line return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) - -def get_in_gff_lines(in_gff, is_add_new_chrom): + +def valid_gff_line(line_elements): + ''' + Checks if the splited line is a valid GFF line. + Returns True if valid, False otherwise. + ''' + if not line_elements[0].startswith("##sequence-region"): + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + return False + else: + ## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is + ## spliting format indicator. + if len(line_elements) != 5: + print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1) + print(line_elements) + return False + return True + +def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -283,33 +303,42 @@ def get_in_gff_lines(in_gff, is_add_new_chrom): continue # Handle differently based on whether we're adding a new chromosome or modifying existing - if not is_add_new_chrom: - # Skip comment lines when modifying existing chromosome - if line.startswith("#"): - continue - # Parse with tab delimiter for regular lines - line_elements = line.split('\t') - # Ensure lines have 9 columns when modifying existing chromosome - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) + if line.startswith("##sequence-region"): + ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome. + ## Select user used delimiter based on content + if '\t' in line: + line_elements = line.split('\t') + line_elements.append('\t') ## Add tab to the end as a format indicator + else: + line_elements = line.split() + line_elements.append(' ') ## Add whitespace to the end as a format indicator + if not valid_gff_line(line_elements): exit() - in_gff_lines.append(line_elements) + # Validate new_chrom value and correct if needed + original_chrom = line_elements[1] + if original_chrom != new_chrom: + print(f"** INFO: Updating chromosome name from {original_chrom} to {new_chrom} to fit the\ + input from new_chrom parameter in command-line.") + line_elements[1] = new_chrom + # Validate sequence_length value and correct if needed + original_start, original_end = line_elements[2], line_elements[3] + if original_start != "1": + print(f"** INFO: Updating start position from {original_start} to 1 to fit the\ + format requirement of annotation file.") + line_elements[2] = "1" + if original_end != str(sequence_length): + print(f"** INFO: Updating sequence length from {original_end} to {sequence_length} to fit the\ + length of sequence in input FASTA file.") + line_elements[3] = str(sequence_length) + elif line.startswith("#"): + ## Ignore other comment lines + continue else: - # For adding new chromosomes, process all header and feature lines - if line.startswith("#"): - # Select appropriate delimiter based on content - if '\t' in line: - line_elements = line.split('\t') - else: - line_elements = line.split() - if line_elements[0] == "##sequence-region": - line_elements[-1] += '\n' - in_gff_lines.append(line_elements) - elif len(line.split('\t')) == 9: - # Normal feature line with 9 tab-delimited columns - in_gff_lines.append(line.split('\t')) - print(in_gff_lines) + ## Split, check and add feature lines + line_elements = line.split('\t') + if not valid_gff_line(line_elements): + exit() + in_gff_lines.append(line_elements) return in_gff_lines def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): @@ -658,7 +687,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") for new_annotation in in_gff_lines: if new_annotation[0] == "##sequence-region": - gff_out.write(" ".join(new_annotation)) + ## Use predefined format for sequence-region line + ## Remove format indicater, and add new line + gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n') elif new_annotation: gff_out.write("\t".join(new_annotation)) diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf index de2f092..29476bf 100644 --- a/test_data/18/gold.gtf +++ b/test_data/18/gold.gtf @@ -2,11 +2,12 @@ X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; -##sequence-region NC_000001.11 1 248956422 +##sequence-region Y 1 16 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +##sequence-region H 1 16 H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf index 86a6f67..326d9fa 100644 --- a/test_data/18/in1.gtf +++ b/test_data/18/in1.gtf @@ -1,4 +1,4 @@ -##sequence-region NC_000001.11 1 248956422 +##sequence-region Y.11 1 16 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf index f4e79ad..9e0b425 100644 --- a/test_data/18/in2.gtf +++ b/test_data/18/in2.gtf @@ -1,3 +1,4 @@ +##sequence-region F.11 3 27 H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; From eccc15cdab99019814ccc2f724c3d835d5d1929e Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 21 Apr 2025 20:25:36 -0400 Subject: [PATCH 09/11] Resolve comments --- reform.py | 37 ++++++++++++++++++++++++++++++++----- test_data/19/gold.fa | 8 ++++++++ test_data/19/gold.gtf | 18 ++++++++++++++++++ test_data/19/in1.fa | 2 ++ test_data/19/in1.gtf | 5 +++++ test_data/19/in2.fa | 2 ++ test_data/19/in2.gtf | 5 +++++ test_data/19/in3.fa | 2 ++ test_data/19/in3.gtf | 5 +++++ test_data/19/ref.fa | 2 ++ test_data/19/ref.gtf | 4 ++++ test_reform.py | 40 ++++++++++++++++++++++++++++++++++++++++ 12 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 test_data/19/gold.fa create mode 100644 test_data/19/gold.gtf create mode 100644 test_data/19/in1.fa create mode 100644 test_data/19/in1.gtf create mode 100644 test_data/19/in2.fa create mode 100644 test_data/19/in2.gtf create mode 100644 test_data/19/in3.fa create mode 100644 test_data/19/in3.gtf create mode 100644 test_data/19/ref.fa create mode 100644 test_data/19/ref.gtf diff --git a/reform.py b/reform.py index 3661bb0..ec5ecbc 100755 --- a/reform.py +++ b/reform.py @@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: SeqIO.write([chrom_seqs[s]], f, "fasta") ## Read in new GFF features from in_gff, False means modify existing chrom - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq)) + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq)) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path): raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") real_path_fa = os.path.realpath(filename_fa) record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + # Check for mismatch between FASTA record ID and command line chromosome name + if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None: + if record.id != in_arg.new_chrom[index]: + print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " + f"and command line parameter ({in_arg.new_chrom[index]}).") + print(f"Using command line chromosome name: {in_arg.new_chrom[index]}") + # The actual override happens in add_new_chrom_seq where a new SeqRecord is created + elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None: + if record.id != in_arg.chrom: + print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " + f"and command line parameter ({in_arg.chrom}).") + print(f"Using command line chromosome name: {in_arg.chrom}") + # The actual override happens in modify_existing_chrom_seq where the existing sequence is modified except IndexError: raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: @@ -289,7 +302,7 @@ def valid_gff_line(line_elements): return False return True -def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): +def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): continue # Handle differently based on whether we're adding a new chromosome or modifying existing - if line.startswith("##sequence-region"): + if line.startswith("##sequence-region") and new_chrom is not None: ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome. ## Select user used delimiter based on content if '\t' in line: @@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): else: ## Split, check and add feature lines line_elements = line.split('\t') + chorme_id = existing_chrom if existing_chrom else new_chrom + if line_elements[0] != chorme_id: + print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.") + print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}") + line_elements[0] = chorme_id if not valid_gff_line(line_elements): exit() in_gff_lines.append(line_elements) @@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i """ Appends new annotations to an existing GFF file without modifying existing features. """ + gff_splitor = '' ref_gff_path = ref_gff ## Handle compressed .gz GFF files if ref_gff.endswith('.gz'): @@ -681,6 +700,12 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i ## Copy all existing annotations to new GFF file with open(ref_gff_path, "r", encoding="utf-8") as f: for line in f: + if line.startswith("##sequence-region") and gff_splitor == '': + ## Select user used delimiter based on content + if '\t' in line: + gff_splitor = ('\t') + else: + gff_splitor = (' ') gff_out.write(line) ## Append new annotations if present if in_gff_lines: @@ -688,8 +713,10 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i for new_annotation in in_gff_lines: if new_annotation[0] == "##sequence-region": ## Use predefined format for sequence-region line + if gff_splitor == '': + gff_splitor = new_annotation[-1] ## Remove format indicater, and add new line - gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n') + gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n') elif new_annotation: gff_out.write("\t".join(new_annotation)) diff --git a/test_data/19/gold.fa b/test_data/19/gold.fa new file mode 100644 index 0000000..0194856 --- /dev/null +++ b/test_data/19/gold.fa @@ -0,0 +1,8 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC +>H +GGGGAATTCCCCGGGG +>M +CCCCGGGGAAAATTTT diff --git a/test_data/19/gold.gtf b/test_data/19/gold.gtf new file mode 100644 index 0000000..29476bf --- /dev/null +++ b/test_data/19/gold.gtf @@ -0,0 +1,18 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +##sequence-region Y 1 16 +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +##sequence-region H 1 16 +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/19/in1.fa b/test_data/19/in1.fa new file mode 100644 index 0000000..53a0c2f --- /dev/null +++ b/test_data/19/in1.fa @@ -0,0 +1,2 @@ +>Z +AAAATTTTGGGGCCCC diff --git a/test_data/19/in1.gtf b/test_data/19/in1.gtf new file mode 100644 index 0000000..26db7ce --- /dev/null +++ b/test_data/19/in1.gtf @@ -0,0 +1,5 @@ +##sequence-region Y.11 1 16 +Z ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/19/in2.fa b/test_data/19/in2.fa new file mode 100644 index 0000000..baf880f --- /dev/null +++ b/test_data/19/in2.fa @@ -0,0 +1,2 @@ +>Y +GGGGAATTCCCCGGGG diff --git a/test_data/19/in2.gtf b/test_data/19/in2.gtf new file mode 100644 index 0000000..9e0b425 --- /dev/null +++ b/test_data/19/in2.gtf @@ -0,0 +1,5 @@ +##sequence-region F.11 3 27 +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/19/in3.fa b/test_data/19/in3.fa new file mode 100644 index 0000000..38306a6 --- /dev/null +++ b/test_data/19/in3.fa @@ -0,0 +1,2 @@ +>M +CCCCGGGGAAAATTTT diff --git a/test_data/19/in3.gtf b/test_data/19/in3.gtf new file mode 100644 index 0000000..46fe5e2 --- /dev/null +++ b/test_data/19/in3.gtf @@ -0,0 +1,5 @@ +##Test Data +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +K ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +Z ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/19/ref.fa b/test_data/19/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/19/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/19/ref.gtf b/test_data/19/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/19/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index 0d421c2..2768106 100644 --- a/test_reform.py +++ b/test_reform.py @@ -762,5 +762,45 @@ def test_case_18(self): os.chdir(wd) + def test_case_19(self): + """ + Case 19: + Testing Reform with incorrect new chrom and comments in input FASTA + and annotation files. Also, testing reform with more formal printing + """ + + wd = os.getcwd() + os.chdir('test_data/19/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y,H,M" \ + --in_fasta=in1.fa,in2.fa,in3.fa \ + --in_gff=in1.gtf,in2.gtf,in3.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + if __name__ == '__main__': unittest.main() From df049016c14baddacc61b25b163ccff1422f9c90 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 21 Apr 2025 21:26:47 -0400 Subject: [PATCH 10/11] use f-string --- reform.py | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/reform.py b/reform.py index ec5ecbc..ea6557c 100755 --- a/reform.py +++ b/reform.py @@ -7,11 +7,11 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord try: - import pgzip as gzip_module - print("\nUsing pgzip for gzip operations.\n") + import pgzip as gzip_module + print(f"\nUsing pgzip for gzip operations.\n") except ImportError: - import gzip as gzip_module - print("\npgzip not found, falling back to gzip.\n") + import gzip as gzip_module + print(f"\npgzip not found, falling back to gzip.\n") def main(): @@ -42,7 +42,7 @@ def main(): add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) print("------------------------------------------") - print("Reform Complete") + print(f"Reform Complete") print("------------------------------------------") print(f"New .fa file created: {os.path.realpath(new_fasta)}") print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") @@ -71,9 +71,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications length_changed = len(str(record.seq)) - (down_position - position - 1) prev_modifications.append((position,length_changed)) if position != down_position: - print("Removing nucleotides from position {} - {}".format(position, down_position)) - print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" - .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + print(f"Removing nucleotides from position {position} - {down_position}") + print(f"Proceeding to insert sequence '{record.description}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}") ## Build the new chromosome sequence with the inserted_seq ## If the chromosome sequence length is in the header, replace it with new length new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:] @@ -207,7 +206,7 @@ def read_fasta(in_arg, index, prev_fasta_path): raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: raise ValueError(f"Error parsing FASTA file: {str(e)}") - print("Preparing to create new FASTA file") + print(f"Preparing to create new FASTA file") print(f"Original FASTA: {real_path_fa}") ## Generate index of sequences from ref reference fasta if prev_fasta_path: @@ -290,14 +289,14 @@ def valid_gff_line(line_elements): ''' if not line_elements[0].startswith("##sequence-region"): if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(f"** ERROR: in_gff file does not have 9 columns, it has {len(line_elements)}") print(line_elements) return False else: ## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is ## spliting format indicator. if len(line_elements) != 5: - print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1) + print(f"** ERROR: ##sequence-region line does not have 4 columns, it has {len(line_elements) - 1}") print(line_elements) return False return True @@ -376,7 +375,7 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo position += lc if position > len(seq_str): print("** ERROR: Position greater than length of chromosome.") - print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) + print(f"Chromosome: {chrom}\nChromosome length: {len(seq_str)}\nPosition: {position}") exit() elif position == -1: position = len(seq_str) @@ -403,8 +402,8 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo else: print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") print("Chromosome: {}\n".format(chrom)) - print("Upstream sequence found {} times".format(upstream_seq_count)) - print("Downstream sequence found {} times".format(downstream_seq_count)) + print(f"Upstream sequence found {upstream_seq_count} times") + print(f"Downstream sequence found {downstream_seq_count} times") exit() else: print("** ERROR: You must specify a valid position or upstream and downstream sequences.") @@ -602,8 +601,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, elif gff_feat_start <= position and gff_feat_end <= down_position: # Which side of the feature depends on the strand (we add this as a comment) x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") new_comment = format_comment( "{} prime side of feature cut-off by inserted sequence".format(x), gff_ext @@ -631,8 +629,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, and gff_feat_start <= down_position and gff_feat_end > down_position): x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") new_comment = format_comment( "{} prime side of feature cut-off by inserted sequence".format(x), gff_ext @@ -656,7 +653,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, gff_out.write(modified_line) else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + print(f"** Error: Unknown case for GFF modification. Exiting {line_elements}") exit() # If we've iterated over the entire original gff @@ -738,7 +735,7 @@ def format_comment(comment, ext): elif ext.lower().startswith('gff'): new_comment = "; reform_comment={}".format(comment) else: - print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) + print(f"** Error: Unrecognized extension {ext} in format_comment(). Exiting") exit() return new_comment @@ -750,15 +747,15 @@ def rename_id(line): attributes = line.split('\t')[8].strip() elements = attributes.split(';') if elements[0].startswith("ID="): - print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) + print(f"Renaming split feature {elements[0]} --> {elements[0]}_split") return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) elif elements[0].startswith("gene_id "): gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] - print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) + print(f"Renaming split feature {gene_id} --> {gene_id}_split") return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) else: - print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) + print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}") return attributes def get_input_args(): From 1c263aa19892bc40a55fdf027a8e9741d7652284 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Thu, 24 Apr 2025 21:43:11 -0400 Subject: [PATCH 11/11] Helper Function for Chromosome Length Calculation --- reform.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/reform.py b/reform.py index ea6557c..ee7abdd 100755 --- a/reform.py +++ b/reform.py @@ -514,7 +514,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, if line.startswith("##sequence-region") and line_elements[1] == chrom_id: ## Edit the length of the chromosome original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) line = line.replace(str(original_length), str(new_length)) gff_out.write(line) else: @@ -555,7 +555,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, # "chromosome" or "region") elif gff_feat_type in ['chromosome', 'region']: original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) line = line.replace(str(original_length), str(new_length)) gff_out.write(line) @@ -757,7 +757,14 @@ def rename_id(line): else: print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}") return attributes - + +def calculate_new_length(original_length, position, down_position, new_seq_length): + """ + Calculate the new chromosome/region length after a modification. + Returns: The new calculated length + """ + return original_length - (down_position - position) + new_seq_length + def get_input_args(): parser = argparse.ArgumentParser() chrom_group = parser.add_mutually_exclusive_group(required=True)