From 58058ceb4a9a1ce3a3b4af8d1c8ac95377ae9bc8 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Wed, 5 Feb 2025 13:21:57 -0500 Subject: [PATCH 01/14] split modify process --- reform.py | 219 +++++++++++++++++++++++++++++------------------------- 1 file changed, 116 insertions(+), 103 deletions(-) diff --git a/reform.py b/reform.py index d164c5a..0d03024 100755 --- a/reform.py +++ b/reform.py @@ -31,111 +31,118 @@ def main(): print("-------------------------------------------") print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") - ## Read the new fasta (to be inserted into the ref genome) - try: - filename_fa = in_arg.in_fasta[index] - if not os.path.exists(filename_fa): - raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") - real_path_fa = os.path.realpath(filename_fa) - record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] - except IndexError: - raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") - except Exception as e: - raise ValueError(f"Error parsing FASTA file: {str(e)}") - print("Preparing to create new FASTA file") - print(f"Original FASTA: {real_path_fa}") - filename_gff = in_arg.in_gff[index] - if not os.path.exists(filename_gff): - raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") - real_path_gff = os.path.realpath(filename_gff) - print("Preparing to create new annotation file") - print(f"Original Annotation: {real_path_gff}") - print() ### print new line + if hasattr(in_arg, 'chrom'): + # Modify existing chrom seq + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ + = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ + iterations, prev_gff_path) - ## Generate index of sequences from ref reference fasta - if prev_fasta_path: - chrom_seqs = index_fasta(prev_fasta_path) - os.remove(prev_fasta_path) - else: - chrom_seqs = index_fasta(in_arg.ref_fasta) - - ## Obtain the sequence of the chromosome we want to modify - seq = chrom_seqs[in_arg.chrom] - seq_str = str(seq.seq) - - ## Get the position to insert the new sequence - positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) - position = positions['position'] - down_position = positions['down_position'] - - ## Save current modification which include position(index) and length changed. - if position == down_position: - length_changed = len(str(record.seq)) - else: - length_changed = len(str(record.seq)) - (down_position - position - 1) - prev_modifications.append((position,length_changed)) - - if position != down_position: - print("Removing nucleotides from position {} - {}".format(position, down_position)) - print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" - .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) - - ## Build the new chromosome sequence with the inserted_seq - ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) - new_length = str(len(new_seq)) - new_record = SeqRecord( - Seq(new_seq), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) + print("------------------------------------------") + print("Reform Complete") + print("------------------------------------------") + print(f"New .fa file created: {os.path.realpath(new_fasta)}") + print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") + +def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path): + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + read_gff(in_arg, index) + ## Obtain the sequence of the chromosome we want to modify + seq = chrom_seqs[in_arg.chrom] + seq_str = str(seq.seq) + ## Get the position to insert the new sequence + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + position = positions['position'] + down_position = positions['down_position'] + ## Save current modification which include position(index) and length changed. + if position == down_position: + length_changed = len(str(record.seq)) + else: + length_changed = len(str(record.seq)) - (down_position - position - 1) + prev_modifications.append((position,length_changed)) + if position != down_position: + print("Removing nucleotides from position {} - {}".format(position, down_position)) + print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" + .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + ## Build the new chromosome sequence with the inserted_seq + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] + chrom_length = str(len(seq_str)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq), + id=seq.id, + description=seq.description.replace(chrom_length, new_length) ) - - ## Create new fasta file with modified chromosome - if index < iterations - 1: - new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') - new_fasta_file.close() - new_fasta = new_fasta_file.name - prev_fasta_path = new_fasta - else: - ref_basename, _ = get_ref_basename(in_arg.ref_fasta) - ref_name = ref_basename - new_fasta = ref_name + '_reformed.fa' - with open(new_fasta, "w") as f: - for s in chrom_seqs: - if s == seq.id: - SeqIO.write([new_record], f, "fasta") - else: - SeqIO.write([chrom_seqs[s]], f, "fasta") - - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) - - ## Create a temp file for gff, if index is not equal to last iteration - annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) - if index < iterations - 1: - temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) - temp_gff_name = temp_gff.name - temp_gff.close() - if prev_gff_path: - new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - os.remove(prev_gff_path) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + with open(new_fasta, "w") as f: + for s in chrom_seqs: + if s == seq.id: + SeqIO.write([new_record], f, "fasta") else: - new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + SeqIO.write([chrom_seqs[s]], f, "fasta") + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + os.remove(prev_gff_path) else: - new_gff_name = annotation_name + '_reformed' + annotation_ext - if prev_gff_path: - new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - else: - new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) - prev_gff_path = new_gff_path - if (index == iterations-1): - print("------------------------------------------") - print("Reform Complete") - print("------------------------------------------") - print(f"New .fa file created: {os.path.realpath(new_fasta)}") - print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + else: + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, +def read_fasta(in_arg, index, prev_fasta_path): + ## Read the new fasta (to be inserted into the ref genome) + try: + filename_fa = in_arg.in_fasta[index] + if not os.path.exists(filename_fa): + raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") + real_path_fa = os.path.realpath(filename_fa) + record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + except IndexError: + raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") + except Exception as e: + raise ValueError(f"Error parsing FASTA file: {str(e)}") + print("Preparing to create new FASTA file") + print(f"Original FASTA: {real_path_fa}") + ## Generate index of sequences from ref reference fasta + if prev_fasta_path: + chrom_seqs = index_fasta(prev_fasta_path) + os.remove(prev_fasta_path) + else: + chrom_seqs = index_fasta(in_arg.ref_fasta) + return record, chrom_seqs + +def read_gff(in_arg, index): + filename_gff = in_arg.in_gff[index] + if not os.path.exists(filename_gff): + raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") + real_path_gff = os.path.realpath(filename_gff) + print("Preparing to create new annotation file") + print(f"Original Annotation: {real_path_gff}") + print() ### print new line def index_fasta(fasta_path): ''' @@ -562,9 +569,11 @@ def rename_id(line): def get_input_args(): parser = argparse.ArgumentParser() - - parser.add_argument('--chrom', type = str, required = True, - help = "Chromosome name (String)") + chrom_group = parser.add_mutually_exclusive_group(required=True) + chrom_group.add_argument('--chrom', type=str, + help="Chromosome name (String)") + chrom_group.add_argument('--new_chrom', type=str, + help="New Chromosome name (String)") parser.add_argument('--in_fasta', type=str, required=True, help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") parser.add_argument('--in_gff', type=str, required=True, @@ -610,6 +619,10 @@ def get_input_args(): print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") exit() + if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta): + parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") + exit() + return in_args, iterations if __name__ == "__main__": From f1aecaa4f31211055f0137dadae03f5e28964ce9 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 7 Feb 2025 21:47:44 -0500 Subject: [PATCH 02/14] add new chrom --- reform.py | 193 +++++++++++++++++++++++++++++++++------ temp.py | 208 ++++++++++++++++++++++++++++++++++++++++++ test_data/17/gold.fa | 4 + test_data/17/gold.gtf | 8 ++ test_data/17/in.fa | 2 + test_data/17/in.gtf | 4 + test_data/17/ref.fa | 2 + test_data/17/ref.gtf | 4 + test_reform.py | 39 ++++++++ 9 files changed, 434 insertions(+), 30 deletions(-) create mode 100644 temp.py create mode 100644 test_data/17/gold.fa create mode 100644 test_data/17/gold.gtf create mode 100644 test_data/17/in.fa create mode 100644 test_data/17/in.gtf create mode 100644 test_data/17/ref.fa create mode 100644 test_data/17/ref.gtf diff --git a/reform.py b/reform.py index 0d03024..fb59319 100755 --- a/reform.py +++ b/reform.py @@ -31,11 +31,15 @@ def main(): print("-------------------------------------------") print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") - if hasattr(in_arg, 'chrom'): + if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: # Modify existing chrom seq new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ iterations, prev_gff_path) + else: + # Add new chrom + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) print("------------------------------------------") print("Reform Complete") @@ -44,7 +48,11 @@ def main(): print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path): - ## Read FASTA + """ + Modifies a specified and existing chromosome sequence by inserting/replacing a new sequence at a given + position and updates the corresponding FASTA and GFF files. + """ + ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) read_gff(in_arg, index) @@ -111,9 +119,69 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) prev_gff_path = new_gff_path - return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path + +def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations): + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + read_gff(in_arg, index) + ## Check if new chromosome existed in sequence + if in_arg.new_chrom in chrom_seqs: + raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.") + + ## Build the new chromosome sequence by append new sequence below + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = str(record.seq) + new_length = str(len(new_seq)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq), + id=in_arg.new_chrom, # Use new_chrom + description=f"{in_arg.new_chrom}" + ) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + ## Write all the original chromosomes first, then new chromosomes. + with open(new_fasta, "w") as f: + for s in chrom_seqs: + SeqIO.write([chrom_seqs[s]], f, "fasta") + SeqIO.write([new_record], f, "fasta") + ## Read in new GFF features from in_gff + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + else: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path def read_fasta(in_arg, index, prev_fasta_path): + """ + Reads the FASTA file, extracts the sequence to be inserted, + and indexes the reference genome chromosome sequences. + """ ## Read the new fasta (to be inserted into the ref genome) try: filename_fa = in_arg.in_fasta[index] @@ -136,6 +204,9 @@ def read_fasta(in_arg, index, prev_fasta_path): return record, chrom_seqs def read_gff(in_arg, index): + """ + Reads the GFF file, verifies its existence, and prints its file path information. + """ filename_gff = in_arg.in_gff[index] if not os.path.exists(filename_gff): raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") @@ -535,6 +606,63 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, os.remove(ref_gff_path) return new_gff_name +def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id): + """ + Appends new annotations to an existing GFF file without modifying existing features. + """ + ref_gff_path = ref_gff + ## Handle compressed .gz GFF files + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + for line in f: + tmp_f.write(line) + tmp_f.flush() + ref_gff_path = tmp_f.name + ## Open new GFF file for writing + with open(new_gff_name, "w") as gff_out: + ## Copy all existing annotations to new GFF file + with open(ref_gff_path, "r", encoding="utf-8") as f: + for line in f: + gff_out.write(line) + ## Append new annotations if present + if in_gff_lines: + print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") + for new_annotation in in_gff_lines: + if new_annotation: + gff_out.write("\t".join(new_annotation)) + + ## Cleanup temp file if needed + if ref_gff.endswith('.gz') and ref_gff_path != ref_gff: + try: + os.remove(ref_gff_path) + except Exception as e: + print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}") + + # with open(new_gff_name, "w") as gff_out: + # ref_gff_path = ref_gff + # if ref_gff.endswith('.gz'): + # with gzip_module.open(ref_gff, 'rt') as f: + # with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + # for line in f: + # tmp_f.write(line) + # tmp_f.flush() + # ref_gff_path = tmp_f.name + # ## Copy all existing annotations to new GFF file + # with open(ref_gff_path, "r") as f: + # for line in f: + # gff_out.write(line) + # # Append new annotations + # print(f"Appending new annotations to chromosome {chrom_id}.") + # for new_annotation in in_gff_lines: + # if new_annotation: + # gff_out.write("\t".join(new_annotation) + "\n") + + # # Cleanup temp file if needed + # if ref_gff.endswith('.gz'): + # os.remove(ref_gff_path) + return new_gff_name + def format_comment(comment, ext): ''' Format comment according to ext (GFF or GTF) and return @@ -592,36 +720,41 @@ def get_input_args(): in_args = parser.parse_args() in_args.in_fasta = in_args.in_fasta.split(',') in_args.in_gff = in_args.in_gff.split(',') - if in_args.upstream_fasta: - in_args.upstream_fasta = in_args.upstream_fasta.split(',') - if in_args.downstream_fasta: - in_args.downstream_fasta = in_args.downstream_fasta.split(',') - - if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None): - print("** Error: You must provide either the position, or the upstream and downstream sequences.") - exit() - - if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta): - print("** Error: The number of upstream_fasta and downstream_fasta files does not match.") + if (len(in_args.in_fasta) != len(in_args.in_gff)): + print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") exit() - - if in_args.position is not None: - try: - in_args.position = list(map(int, in_args.position.split(','))) - iterations = iterations = len(in_args.position) - except ValueError: - print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.") + else: + iterations = len(in_args.in_fasta) + ## Modify existing chrom + if in_args.chrom: + if in_args.upstream_fasta: + in_args.upstream_fasta = in_args.upstream_fasta.split(',') + if in_args.downstream_fasta: + in_args.downstream_fasta = in_args.downstream_fasta.split(',') + if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None): + print("** Error: You must provide either the position, or the upstream and downstream sequences.") + exit() + if in_args.position is not None: + try: + in_args.position = list(map(int, in_args.position.split(','))) + except ValueError: + print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.") + exit() + if iterations != len(in_args.position): + print("** Error: Position must be a equal to number of input FASTA") + exit() + else: + if iterations != len(in_args.upstream_fasta): + print("** Error: Upstream FASTA must be a equal to number of input FASTA") + exit() + if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta): + print("** Error: The number of upstream_fasta and downstream_fasta files does not match.") exit() + ## Add new chrom else: - iterations = len(in_args.upstream_fasta) - - if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations): - print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") - exit() - - if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta): - parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") - exit() + if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta: + parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") + exit() return in_args, iterations diff --git a/temp.py b/temp.py new file mode 100644 index 0000000..077400a --- /dev/null +++ b/temp.py @@ -0,0 +1,208 @@ +def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): + ''' + Goes line by line through a gff file to remove existing features + (or parts of existing features) and/or insert new features. + In the process of adding new features, existing features + may be cut-off on either end or split into two. + This attempts to handle all cases. + new_gff_name: the name of the new gff file to create + ref_gff: the reference gff file to modify + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + position: start position of removal of existing sequence + down_position: end position of removal of existing sequence + chrom_id: the ID of the chromosome to modify + new_seq_length: the length of the new sequence being added to the chromosome + ''' + with open(new_gff_name, "w") as gff_out: + in_gff_lines_appended = False + split_features = [] + last_seen_chrom_id = None + gff_ext = new_gff_name.split('.')[-1] + ref_gff_path = ref_gff + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + ## Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + ref_gff_path = tmp_f.name + with open(ref_gff_path, "r") as f: + for line in f: + line_elements = line.split('\t') + if line.startswith("#"): + if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + ## Edit the length of the chromosome + original_length = int(line_elements[3]) + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + else: + gff_chrom_id = line_elements[0] + gff_feat_start = int(line_elements[3]) + gff_feat_end = int(line_elements[4]) + gff_feat_type = line_elements[2] + gff_feat_strand = line_elements[6] + gff_comments = line_elements[8].strip() + + # If we've seen at least one chromosome + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) + # and now we're on a new chromosome in the original gff file + # and the in_gff_lines have not yet been appended: + # we assume they need to be appended to the end of the chromosome + # so append them before proceeding + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and gff_chrom_id != last_seen_chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + last_seen_chrom_id = gff_chrom_id + + # If this is not the chromosome of interest + # Or if the current feature ends before any + # modification (which occurs at position) + # Then simply write the feature as is (no modification) + # (remember gff co-ordinates are 1 based and position + # is 0 based, therefor check less than *or equal to*) + if gff_chrom_id != chrom_id or gff_feat_end <= position: + gff_out.write(line) + + # Modify chromosome feature length (if feature is + # "chromosome" or "region") + elif gff_feat_type in ['chromosome', 'region']: + original_length = gff_feat_end + new_length = original_length - (down_position - position) + new_seq_length + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + + # Split feature into 2 if feature starts before position + # and ends after down_position + elif gff_feat_start <= position and gff_feat_end > down_position: + print("Feature split") + print(line) + # Which side of the feature depends on the strand (we add this as a comment) + (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") + + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(x), + gff_ext + ) + # Modified feature ends at 'position' + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # The downstream flank(s) will be added immediately after the + # in_gff (new) features are written. + # First, attempt to rename IDs (to indicate split) + renamed_id_attributes = rename_id(line) + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(y), + gff_ext + ) + split_features.append( + ( + line_elements, + position + new_seq_length + 1, + gff_feat_end + new_seq_length - (down_position - position), + renamed_id_attributes + new_comment + ) + ) + + # Change end position of feature to cut off point (position) if the + # feature ends within the deletion (between position & down_position) + elif gff_feat_start <= position and gff_feat_end <= down_position: + # Which side of the feature depends on the strand (we add this as a comment) + x = "3" if gff_feat_strand == "+" else "5" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Skip this feature if it falls entirely within the deletion + elif gff_feat_start > position and gff_feat_end <= down_position: + print("Skip feature (this feature was removed from sequence)") + continue + + else: + if not in_gff_lines_appended: + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Change start position of feature to after cutoff point if + # the feature starts within the deletion + if (gff_feat_start > position + and gff_feat_start <= down_position + and gff_feat_end > down_position): + x = "5" if gff_feat_strand == "+" else "3" + print("Feature cut off - {} prime side of feature cut off ({} strand)" + .format(x, gff_feat_strand)) + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + start = position + new_seq_length + 1, + end = gff_feat_end + new_seq_length - (down_position - position), + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Offset all downstream feature positions by offset length + elif gff_feat_start > down_position: + offset_length = new_seq_length - (down_position - position) + modified_line = modify_gff_line( + line_elements, + start = gff_feat_start + offset_length, + end = gff_feat_end + offset_length + ) + gff_out.write(modified_line) + + else: + print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + exit() + + # If we've iterated over the entire original gff + # (i.e. out of the for loop which iterates over it) + # and still haven't written in_gff_lines + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): + # we assume they need to be appended to the end of the genome + # (i.e. end of the last chromosome) + # so append them now + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Checking to ensure in_gff_lines written + if not in_gff_lines_appended: + print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") + exit() + # Remove temp gtf file + if ref_gff.endswith('.gz'): + os.remove(ref_gff_path) + return new_gff_name + + +""" +python3 ../../reform.py \ + --new_chrom="Y" \ + --in_fasta=in.fa \ + --in_gff=in.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf +""" \ No newline at end of file diff --git a/test_data/17/gold.fa b/test_data/17/gold.fa new file mode 100644 index 0000000..79a60e4 --- /dev/null +++ b/test_data/17/gold.fa @@ -0,0 +1,4 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/17/gold.gtf b/test_data/17/gold.gtf new file mode 100644 index 0000000..83f530e --- /dev/null +++ b/test_data/17/gold.gtf @@ -0,0 +1,8 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/17/in.fa b/test_data/17/in.fa new file mode 100644 index 0000000..2b55705 --- /dev/null +++ b/test_data/17/in.fa @@ -0,0 +1,2 @@ +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/17/in.gtf b/test_data/17/in.gtf new file mode 100644 index 0000000..7dc556d --- /dev/null +++ b/test_data/17/in.gtf @@ -0,0 +1,4 @@ +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/17/ref.fa b/test_data/17/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/17/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/17/ref.gtf b/test_data/17/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/17/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index a10d75e..61054b6 100644 --- a/test_reform.py +++ b/test_reform.py @@ -684,5 +684,44 @@ def test_case_16(self): os.chdir(wd) + def test_case_17(self): + """ + Case 17: + Testing Reform which adding new chrom + """ + + wd = os.getcwd() + os.chdir('test_data/17/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y" \ + --in_fasta=in.fa \ + --in_gff=in.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf \ + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + if __name__ == '__main__': unittest.main() From 05a6e15be09e0be00e7aa40d60f54549312a7305 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 7 Feb 2025 22:03:34 -0500 Subject: [PATCH 03/14] remove redundancy comments and temp file --- reform.py | 35 ++------- temp.py | 208 ------------------------------------------------------ 2 files changed, 7 insertions(+), 236 deletions(-) delete mode 100644 temp.py diff --git a/reform.py b/reform.py index fb59319..dd0ed84 100755 --- a/reform.py +++ b/reform.py @@ -32,12 +32,12 @@ def main(): print(f"Begin modification from in{index+1}.fa") print("-------------------------------------------") if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: - # Modify existing chrom seq - new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \ - = modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ - iterations, prev_gff_path) + ## Modify existing chrom seq + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ + iterations, prev_gff_path) else: - # Add new chrom + ## Add new chrom new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) @@ -118,7 +118,9 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) else: new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations): @@ -246,7 +248,6 @@ def get_ref_basename(filepath): name, ext = os.path.splitext(base) return name, ext - def modify_gff_line(elements, start=None, end=None, comment=None): ''' Modifies an existing GFF line and returns the modified line. Currently, you can @@ -639,28 +640,6 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i except Exception as e: print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}") - # with open(new_gff_name, "w") as gff_out: - # ref_gff_path = ref_gff - # if ref_gff.endswith('.gz'): - # with gzip_module.open(ref_gff, 'rt') as f: - # with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: - # for line in f: - # tmp_f.write(line) - # tmp_f.flush() - # ref_gff_path = tmp_f.name - # ## Copy all existing annotations to new GFF file - # with open(ref_gff_path, "r") as f: - # for line in f: - # gff_out.write(line) - # # Append new annotations - # print(f"Appending new annotations to chromosome {chrom_id}.") - # for new_annotation in in_gff_lines: - # if new_annotation: - # gff_out.write("\t".join(new_annotation) + "\n") - - # # Cleanup temp file if needed - # if ref_gff.endswith('.gz'): - # os.remove(ref_gff_path) return new_gff_name def format_comment(comment, ext): diff --git a/temp.py b/temp.py deleted file mode 100644 index 077400a..0000000 --- a/temp.py +++ /dev/null @@ -1,208 +0,0 @@ -def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): - ''' - Goes line by line through a gff file to remove existing features - (or parts of existing features) and/or insert new features. - In the process of adding new features, existing features - may be cut-off on either end or split into two. - This attempts to handle all cases. - new_gff_name: the name of the new gff file to create - ref_gff: the reference gff file to modify - in_gff_lines: a list of lists where each nested list is a list of - columns (in gff format) associated with each new feature to insert - position: start position of removal of existing sequence - down_position: end position of removal of existing sequence - chrom_id: the ID of the chromosome to modify - new_seq_length: the length of the new sequence being added to the chromosome - ''' - with open(new_gff_name, "w") as gff_out: - in_gff_lines_appended = False - split_features = [] - last_seen_chrom_id = None - gff_ext = new_gff_name.split('.')[-1] - ref_gff_path = ref_gff - if ref_gff.endswith('.gz'): - with gzip_module.open(ref_gff, 'rt') as f: - ## Create a tempfile to store uncompressde content - with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: - tmp_f.write(f.read()) - ref_gff_path = tmp_f.name - with open(ref_gff_path, "r") as f: - for line in f: - line_elements = line.split('\t') - if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: - ## Edit the length of the chromosome - original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - else: - gff_chrom_id = line_elements[0] - gff_feat_start = int(line_elements[3]) - gff_feat_end = int(line_elements[4]) - gff_feat_type = line_elements[2] - gff_feat_strand = line_elements[6] - gff_comments = line_elements[8].strip() - - # If we've seen at least one chromosome - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) - # and now we're on a new chromosome in the original gff file - # and the in_gff_lines have not yet been appended: - # we assume they need to be appended to the end of the chromosome - # so append them before proceeding - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and gff_chrom_id != last_seen_chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - last_seen_chrom_id = gff_chrom_id - - # If this is not the chromosome of interest - # Or if the current feature ends before any - # modification (which occurs at position) - # Then simply write the feature as is (no modification) - # (remember gff co-ordinates are 1 based and position - # is 0 based, therefor check less than *or equal to*) - if gff_chrom_id != chrom_id or gff_feat_end <= position: - gff_out.write(line) - - # Modify chromosome feature length (if feature is - # "chromosome" or "region") - elif gff_feat_type in ['chromosome', 'region']: - original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length - line = line.replace(str(original_length), str(new_length)) - gff_out.write(line) - - # Split feature into 2 if feature starts before position - # and ends after down_position - elif gff_feat_start <= position and gff_feat_end > down_position: - print("Feature split") - print(line) - # Which side of the feature depends on the strand (we add this as a comment) - (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") - - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(x), - gff_ext - ) - # Modified feature ends at 'position' - modified_line = modify_gff_line( - line_elements, - end = position, - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # The downstream flank(s) will be added immediately after the - # in_gff (new) features are written. - # First, attempt to rename IDs (to indicate split) - renamed_id_attributes = rename_id(line) - new_comment = format_comment( - "original feature split by inserted sequence, this is the {} prime end".format(y), - gff_ext - ) - split_features.append( - ( - line_elements, - position + new_seq_length + 1, - gff_feat_end + new_seq_length - (down_position - position), - renamed_id_attributes + new_comment - ) - ) - - # Change end position of feature to cut off point (position) if the - # feature ends within the deletion (between position & down_position) - elif gff_feat_start <= position and gff_feat_end <= down_position: - # Which side of the feature depends on the strand (we add this as a comment) - x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - end = position, - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Skip this feature if it falls entirely within the deletion - elif gff_feat_start > position and gff_feat_end <= down_position: - print("Skip feature (this feature was removed from sequence)") - continue - - else: - if not in_gff_lines_appended: - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - # Change start position of feature to after cutoff point if - # the feature starts within the deletion - if (gff_feat_start > position - and gff_feat_start <= down_position - and gff_feat_end > down_position): - x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) - new_comment = format_comment( - "{} prime side of feature cut-off by inserted sequence".format(x), - gff_ext - ) - modified_line = modify_gff_line( - line_elements, - start = position + new_seq_length + 1, - end = gff_feat_end + new_seq_length - (down_position - position), - comment = gff_comments + new_comment - ) - gff_out.write(modified_line) - - # Offset all downstream feature positions by offset length - elif gff_feat_start > down_position: - offset_length = new_seq_length - (down_position - position) - modified_line = modify_gff_line( - line_elements, - start = gff_feat_start + offset_length, - end = gff_feat_end + offset_length - ) - gff_out.write(modified_line) - - else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) - exit() - - # If we've iterated over the entire original gff - # (i.e. out of the for loop which iterates over it) - # and still haven't written in_gff_lines - # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): - # we assume they need to be appended to the end of the genome - # (i.e. end of the last chromosome) - # so append them now - if (last_seen_chrom_id is not None - and last_seen_chrom_id == chrom_id - and not in_gff_lines_appended): - in_gff_lines_appended = write_in_gff_lines( - gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) - - # Checking to ensure in_gff_lines written - if not in_gff_lines_appended: - print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") - exit() - # Remove temp gtf file - if ref_gff.endswith('.gz'): - os.remove(ref_gff_path) - return new_gff_name - - -""" -python3 ../../reform.py \ - --new_chrom="Y" \ - --in_fasta=in.fa \ - --in_gff=in.gtf \ - --ref_fasta=ref.fa \ - --ref_gff=ref.gtf -""" \ No newline at end of file From b2c1bb446c143eac14555f07678cc4795c801304 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 3 Mar 2025 15:40:51 -0500 Subject: [PATCH 04/14] Fix bugs --- reform.py | 56 +++++++++++++++++++++---------------------- test_data/18/gold.fa | 8 +++++++ test_data/18/gold.gtf | 16 +++++++++++++ test_data/18/in1.fa | 2 ++ test_data/18/in1.gtf | 4 ++++ test_data/18/in2.fa | 2 ++ test_data/18/in2.gtf | 4 ++++ test_data/18/in3.fa | 2 ++ test_data/18/in3.gtf | 4 ++++ test_data/18/ref.fa | 2 ++ test_data/18/ref.gtf | 4 ++++ test_reform.py | 41 ++++++++++++++++++++++++++++++- 12 files changed, 116 insertions(+), 29 deletions(-) create mode 100644 test_data/18/gold.fa create mode 100644 test_data/18/gold.gtf create mode 100644 test_data/18/in1.fa create mode 100644 test_data/18/in1.gtf create mode 100644 test_data/18/in2.fa create mode 100644 test_data/18/in2.gtf create mode 100644 test_data/18/in3.fa create mode 100644 test_data/18/in3.gtf create mode 100644 test_data/18/ref.fa create mode 100644 test_data/18/ref.gtf diff --git a/reform.py b/reform.py index dd0ed84..eb918fc 100755 --- a/reform.py +++ b/reform.py @@ -55,12 +55,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) - read_gff(in_arg, index) + check_gff(in_arg, index) ## Obtain the sequence of the chromosome we want to modify - seq = chrom_seqs[in_arg.chrom] - seq_str = str(seq.seq) + existing_seq = chrom_seqs[in_arg.chrom] + existing_seq_str = str(existing_seq.seq) + ## Get the position to insert the new sequence - positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications) + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, existing_seq_str, prev_modifications) position = positions['position'] down_position = positions['down_position'] ## Save current modification which include position(index) and length changed. @@ -75,13 +76,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) ## Build the new chromosome sequence with the inserted_seq ## If the chromosome sequence length is in the header, replace it with new length - new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:] - chrom_length = str(len(seq_str)) + new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:] + chrom_length = str(len(existing_seq_str)) new_length = str(len(new_seq)) new_record = SeqRecord( Seq(new_seq), - id=seq.id, - description=seq.description.replace(chrom_length, new_length) + id=existing_seq.id, + description=existing_seq.description.replace(chrom_length, new_length) ) ## Create new fasta file with modified chromosome if index < iterations - 1: @@ -95,7 +96,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications new_fasta = ref_name + '_reformed.fa' with open(new_fasta, "w") as f: for s in chrom_seqs: - if s == seq.id: + if s == existing_seq.id: SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") @@ -108,16 +109,16 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications temp_gff_name = temp_gff.name temp_gff.close() if prev_gff_path: - new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) os.remove(prev_gff_path) else: - new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) else: new_gff_name = annotation_name + '_reformed' + annotation_ext if prev_gff_path: - new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) else: - new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq))) + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) prev_gff_path = new_gff_path @@ -127,20 +128,17 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) ## Read FASTA record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) ## Read annotation (gff/gtf) - read_gff(in_arg, index) + check_gff(in_arg, index) ## Check if new chromosome existed in sequence - if in_arg.new_chrom in chrom_seqs: - raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.") - + if in_arg.new_chrom[index] in chrom_seqs: + raise ValueError(f"Chromosome {in_arg.new_chrom[index]} already exists in the FASTA file.") ## Build the new chromosome sequence by append new sequence below - ## If the chromosome sequence length is in the header, replace it with new length + ## Using new_chrom as the id of the new chromosome new_seq = str(record.seq) - new_length = str(len(new_seq)) - new_length = str(len(new_seq)) new_record = SeqRecord( Seq(new_seq), - id=in_arg.new_chrom, # Use new_chrom - description=f"{in_arg.new_chrom}" + id=in_arg.new_chrom[index], # Use new_chrom + description=f"{in_arg.new_chrom[index]}" ) ## Create new fasta file with modified chromosome if index < iterations - 1: @@ -166,16 +164,16 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) temp_gff_name = temp_gff.name temp_gff.close() if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) os.remove(prev_gff_path) else: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) else: new_gff_name = annotation_name + '_reformed' + annotation_ext if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) else: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) prev_gff_path = new_gff_path return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path @@ -205,7 +203,7 @@ def read_fasta(in_arg, index, prev_fasta_path): chrom_seqs = index_fasta(in_arg.ref_fasta) return record, chrom_seqs -def read_gff(in_arg, index): +def check_gff(in_arg, index): """ Reads the GFF file, verifies its existence, and prints its file path information. """ @@ -680,7 +678,7 @@ def get_input_args(): chrom_group.add_argument('--chrom', type=str, help="Chromosome name (String)") chrom_group.add_argument('--new_chrom', type=str, - help="New Chromosome name (String)") + help="Comma-separated new chromosome name (String)") parser.add_argument('--in_fasta', type=str, required=True, help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") parser.add_argument('--in_gff', type=str, required=True, @@ -734,6 +732,8 @@ def get_input_args(): if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta: parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") exit() + ## Convert new_chrom from string to list + in_args.new_chrom = in_args.new_chrom.split(',') return in_args, iterations diff --git a/test_data/18/gold.fa b/test_data/18/gold.fa new file mode 100644 index 0000000..0194856 --- /dev/null +++ b/test_data/18/gold.fa @@ -0,0 +1,8 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC +>H +GGGGAATTCCCCGGGG +>M +CCCCGGGGAAAATTTT diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf new file mode 100644 index 0000000..7d876ea --- /dev/null +++ b/test_data/18/gold.gtf @@ -0,0 +1,16 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/18/in1.fa b/test_data/18/in1.fa new file mode 100644 index 0000000..2b55705 --- /dev/null +++ b/test_data/18/in1.fa @@ -0,0 +1,2 @@ +>Y +AAAATTTTGGGGCCCC diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf new file mode 100644 index 0000000..7dc556d --- /dev/null +++ b/test_data/18/in1.gtf @@ -0,0 +1,4 @@ +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in2.fa b/test_data/18/in2.fa new file mode 100644 index 0000000..0b5cf0c --- /dev/null +++ b/test_data/18/in2.fa @@ -0,0 +1,2 @@ +>H +GGGGAATTCCCCGGGG diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf new file mode 100644 index 0000000..f4e79ad --- /dev/null +++ b/test_data/18/in2.gtf @@ -0,0 +1,4 @@ +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/18/in3.fa b/test_data/18/in3.fa new file mode 100644 index 0000000..38306a6 --- /dev/null +++ b/test_data/18/in3.fa @@ -0,0 +1,2 @@ +>M +CCCCGGGGAAAATTTT diff --git a/test_data/18/in3.gtf b/test_data/18/in3.gtf new file mode 100644 index 0000000..ee9257c --- /dev/null +++ b/test_data/18/in3.gtf @@ -0,0 +1,4 @@ +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/18/ref.fa b/test_data/18/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/18/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/18/ref.gtf b/test_data/18/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/18/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index 61054b6..0d421c2 100644 --- a/test_reform.py +++ b/test_reform.py @@ -699,7 +699,46 @@ def test_case_17(self): --in_fasta=in.fa \ --in_gff=in.gtf \ --ref_fasta=ref.fa \ - --ref_gff=ref.gtf \ + --ref_gff=ref.gtf + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + + def test_case_18(self): + """ + Case 18: + Testing Reform which adding multiple new chroms + """ + + wd = os.getcwd() + os.chdir('test_data/18/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y,H,M" \ + --in_fasta=in1.fa,in2.fa,in3.fa \ + --in_gff=in1.gtf,in2.gtf,in3.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf """ response = subprocess.getoutput(command) From bf158ee990ba5ac605cc728f499ae1d8f3ab5cb5 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 14 Mar 2025 14:25:12 -0400 Subject: [PATCH 05/14] Read meta-data from input gtf/gff --- reform.py | 63 ++++++++++++++++++++++++++++++++----------- test_data/18/gold.gtf | 1 + test_data/18/in1.gtf | 1 + 3 files changed, 49 insertions(+), 16 deletions(-) diff --git a/reform.py b/reform.py index eb918fc..c978a7a 100755 --- a/reform.py +++ b/reform.py @@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: SeqIO.write([chrom_seqs[s]], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -156,7 +156,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([chrom_seqs[s]], f, "fasta") SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -269,7 +269,7 @@ def modify_gff_line(elements, start=None, end=None, comment=None): ## Return the modified line return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) -def get_in_gff_lines(in_gff): +def get_in_gff_lines(in_gff, is_add_new_chrom): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -278,16 +278,38 @@ def get_in_gff_lines(in_gff): with open(in_gff, "r") as f: in_gff_lines = [] for line in f: - line_elements = line.split('\t') - # Skip comment lines - if line.startswith("#"): + # Skip empty lines + if not line.strip(): continue - # Ensure lines have 9 columns - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) - exit() - in_gff_lines.append(line_elements) + + # Handle differently based on whether we're adding a new chromosome or modifying existing + if not is_add_new_chrom: + # Skip comment lines when modifying existing chromosome + if line.startswith("#"): + continue + # Parse with tab delimiter for regular lines + line_elements = line.split('\t') + # Ensure lines have 9 columns when modifying existing chromosome + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + exit() + in_gff_lines.append(line_elements) + else: + # For adding new chromosomes, process all header and feature lines + if line.startswith("#"): + # Select appropriate delimiter based on content + if '\t' in line: + line_elements = line.split('\t') + else: + line_elements = line.split() + if line_elements[0] == "##sequence-region": + line_elements[-1] += '\n' + in_gff_lines.append(line_elements) + elif len(line.split('\t')) == 9: + # Normal feature line with 9 tab-delimited columns + in_gff_lines.append(line.split('\t')) + print(in_gff_lines) return in_gff_lines def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): @@ -436,15 +458,22 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, ref_gff_path = tmp_f.name with open(ref_gff_path, "r") as f: for line in f: - line_elements = line.split('\t') + # For header lines, handle both tab and space-delimited formats if line.startswith("#"): - if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id: + if '\t' in line: + line_elements = line.split('\t') + else: + line_elements = line.split() + + if line.startswith("##sequence-region") and line_elements[1] == chrom_id: ## Edit the length of the chromosome original_length = int(line_elements[3]) new_length = original_length - (down_position - position) + new_seq_length line = line.replace(str(original_length), str(new_length)) gff_out.write(line) else: + # Regular feature lines are always tab-delimited + line_elements = line.split('\t') gff_chrom_id = line_elements[0] gff_feat_start = int(line_elements[3]) gff_feat_end = int(line_elements[4]) @@ -628,7 +657,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i if in_gff_lines: print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") for new_annotation in in_gff_lines: - if new_annotation: + if new_annotation[0] == "##sequence-region": + gff_out.write(" ".join(new_annotation)) + elif new_annotation: gff_out.write("\t".join(new_annotation)) ## Cleanup temp file if needed @@ -739,4 +770,4 @@ def get_input_args(): if __name__ == "__main__": main() - + diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf index 7d876ea..de2f092 100644 --- a/test_data/18/gold.gtf +++ b/test_data/18/gold.gtf @@ -2,6 +2,7 @@ X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +##sequence-region NC_000001.11 1 248956422 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf index 7dc556d..86a6f67 100644 --- a/test_data/18/in1.gtf +++ b/test_data/18/in1.gtf @@ -1,3 +1,4 @@ +##sequence-region NC_000001.11 1 248956422 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; From d7047cd3e3d78e9293c23542a6f43202b8402d6a Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 14 Apr 2025 17:49:23 -0400 Subject: [PATCH 06/14] feature improvement --- reform.py | 93 ++++++++++++++++++++++++++++--------------- test_data/18/gold.gtf | 3 +- test_data/18/in1.gtf | 2 +- test_data/18/in2.gtf | 1 + 4 files changed, 66 insertions(+), 33 deletions(-) diff --git a/reform.py b/reform.py index c978a7a..3661bb0 100755 --- a/reform.py +++ b/reform.py @@ -100,8 +100,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications SeqIO.write([new_record], f, "fasta") else: SeqIO.write([chrom_seqs[s]], f, "fasta") - ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False) + ## Read in new GFF features from in_gff, False means modify existing chrom + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -156,7 +156,8 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([chrom_seqs[s]], f, "fasta") SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True) + ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line + in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq)) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -268,8 +269,27 @@ def modify_gff_line(elements, start=None, end=None, comment=None): ## Return the modified line return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) - -def get_in_gff_lines(in_gff, is_add_new_chrom): + +def valid_gff_line(line_elements): + ''' + Checks if the splited line is a valid GFF line. + Returns True if valid, False otherwise. + ''' + if not line_elements[0].startswith("##sequence-region"): + if len(line_elements) != 9: + print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(line_elements) + return False + else: + ## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is + ## spliting format indicator. + if len(line_elements) != 5: + print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1) + print(line_elements) + return False + return True + +def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -283,33 +303,42 @@ def get_in_gff_lines(in_gff, is_add_new_chrom): continue # Handle differently based on whether we're adding a new chromosome or modifying existing - if not is_add_new_chrom: - # Skip comment lines when modifying existing chromosome - if line.startswith("#"): - continue - # Parse with tab delimiter for regular lines - line_elements = line.split('\t') - # Ensure lines have 9 columns when modifying existing chromosome - if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) - print(line_elements) + if line.startswith("##sequence-region"): + ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome. + ## Select user used delimiter based on content + if '\t' in line: + line_elements = line.split('\t') + line_elements.append('\t') ## Add tab to the end as a format indicator + else: + line_elements = line.split() + line_elements.append(' ') ## Add whitespace to the end as a format indicator + if not valid_gff_line(line_elements): exit() - in_gff_lines.append(line_elements) + # Validate new_chrom value and correct if needed + original_chrom = line_elements[1] + if original_chrom != new_chrom: + print(f"** INFO: Updating chromosome name from {original_chrom} to {new_chrom} to fit the\ + input from new_chrom parameter in command-line.") + line_elements[1] = new_chrom + # Validate sequence_length value and correct if needed + original_start, original_end = line_elements[2], line_elements[3] + if original_start != "1": + print(f"** INFO: Updating start position from {original_start} to 1 to fit the\ + format requirement of annotation file.") + line_elements[2] = "1" + if original_end != str(sequence_length): + print(f"** INFO: Updating sequence length from {original_end} to {sequence_length} to fit the\ + length of sequence in input FASTA file.") + line_elements[3] = str(sequence_length) + elif line.startswith("#"): + ## Ignore other comment lines + continue else: - # For adding new chromosomes, process all header and feature lines - if line.startswith("#"): - # Select appropriate delimiter based on content - if '\t' in line: - line_elements = line.split('\t') - else: - line_elements = line.split() - if line_elements[0] == "##sequence-region": - line_elements[-1] += '\n' - in_gff_lines.append(line_elements) - elif len(line.split('\t')) == 9: - # Normal feature line with 9 tab-delimited columns - in_gff_lines.append(line.split('\t')) - print(in_gff_lines) + ## Split, check and add feature lines + line_elements = line.split('\t') + if not valid_gff_line(line_elements): + exit() + in_gff_lines.append(line_elements) return in_gff_lines def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): @@ -658,7 +687,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") for new_annotation in in_gff_lines: if new_annotation[0] == "##sequence-region": - gff_out.write(" ".join(new_annotation)) + ## Use predefined format for sequence-region line + ## Remove format indicater, and add new line + gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n') elif new_annotation: gff_out.write("\t".join(new_annotation)) diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf index de2f092..29476bf 100644 --- a/test_data/18/gold.gtf +++ b/test_data/18/gold.gtf @@ -2,11 +2,12 @@ X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; -##sequence-region NC_000001.11 1 248956422 +##sequence-region Y 1 16 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +##sequence-region H 1 16 H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf index 86a6f67..326d9fa 100644 --- a/test_data/18/in1.gtf +++ b/test_data/18/in1.gtf @@ -1,4 +1,4 @@ -##sequence-region NC_000001.11 1 248956422 +##sequence-region Y.11 1 16 Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf index f4e79ad..9e0b425 100644 --- a/test_data/18/in2.gtf +++ b/test_data/18/in2.gtf @@ -1,3 +1,4 @@ +##sequence-region F.11 3 27 H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; From eccc15cdab99019814ccc2f724c3d835d5d1929e Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 21 Apr 2025 20:25:36 -0400 Subject: [PATCH 07/14] Resolve comments --- reform.py | 37 ++++++++++++++++++++++++++++++++----- test_data/19/gold.fa | 8 ++++++++ test_data/19/gold.gtf | 18 ++++++++++++++++++ test_data/19/in1.fa | 2 ++ test_data/19/in1.gtf | 5 +++++ test_data/19/in2.fa | 2 ++ test_data/19/in2.gtf | 5 +++++ test_data/19/in3.fa | 2 ++ test_data/19/in3.gtf | 5 +++++ test_data/19/ref.fa | 2 ++ test_data/19/ref.gtf | 4 ++++ test_reform.py | 40 ++++++++++++++++++++++++++++++++++++++++ 12 files changed, 125 insertions(+), 5 deletions(-) create mode 100644 test_data/19/gold.fa create mode 100644 test_data/19/gold.gtf create mode 100644 test_data/19/in1.fa create mode 100644 test_data/19/in1.gtf create mode 100644 test_data/19/in2.fa create mode 100644 test_data/19/in2.gtf create mode 100644 test_data/19/in3.fa create mode 100644 test_data/19/in3.gtf create mode 100644 test_data/19/ref.fa create mode 100644 test_data/19/ref.gtf diff --git a/reform.py b/reform.py index 3661bb0..ec5ecbc 100755 --- a/reform.py +++ b/reform.py @@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications else: SeqIO.write([chrom_seqs[s]], f, "fasta") ## Read in new GFF features from in_gff, False means modify existing chrom - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index]) + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line - in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq)) + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq)) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path): raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") real_path_fa = os.path.realpath(filename_fa) record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + # Check for mismatch between FASTA record ID and command line chromosome name + if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None: + if record.id != in_arg.new_chrom[index]: + print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " + f"and command line parameter ({in_arg.new_chrom[index]}).") + print(f"Using command line chromosome name: {in_arg.new_chrom[index]}") + # The actual override happens in add_new_chrom_seq where a new SeqRecord is created + elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None: + if record.id != in_arg.chrom: + print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " + f"and command line parameter ({in_arg.chrom}).") + print(f"Using command line chromosome name: {in_arg.chrom}") + # The actual override happens in modify_existing_chrom_seq where the existing sequence is modified except IndexError: raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: @@ -289,7 +302,7 @@ def valid_gff_line(line_elements): return False return True -def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): +def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None): ''' Takes a gff file and returns a list of lists where each parent list item is a single line of the gff file @@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): continue # Handle differently based on whether we're adding a new chromosome or modifying existing - if line.startswith("##sequence-region"): + if line.startswith("##sequence-region") and new_chrom is not None: ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome. ## Select user used delimiter based on content if '\t' in line: @@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None): else: ## Split, check and add feature lines line_elements = line.split('\t') + chorme_id = existing_chrom if existing_chrom else new_chrom + if line_elements[0] != chorme_id: + print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.") + print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}") + line_elements[0] = chorme_id if not valid_gff_line(line_elements): exit() in_gff_lines.append(line_elements) @@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i """ Appends new annotations to an existing GFF file without modifying existing features. """ + gff_splitor = '' ref_gff_path = ref_gff ## Handle compressed .gz GFF files if ref_gff.endswith('.gz'): @@ -681,6 +700,12 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i ## Copy all existing annotations to new GFF file with open(ref_gff_path, "r", encoding="utf-8") as f: for line in f: + if line.startswith("##sequence-region") and gff_splitor == '': + ## Select user used delimiter based on content + if '\t' in line: + gff_splitor = ('\t') + else: + gff_splitor = (' ') gff_out.write(line) ## Append new annotations if present if in_gff_lines: @@ -688,8 +713,10 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i for new_annotation in in_gff_lines: if new_annotation[0] == "##sequence-region": ## Use predefined format for sequence-region line + if gff_splitor == '': + gff_splitor = new_annotation[-1] ## Remove format indicater, and add new line - gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n') + gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n') elif new_annotation: gff_out.write("\t".join(new_annotation)) diff --git a/test_data/19/gold.fa b/test_data/19/gold.fa new file mode 100644 index 0000000..0194856 --- /dev/null +++ b/test_data/19/gold.fa @@ -0,0 +1,8 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +AAAATTTTGGGGCCCC +>H +GGGGAATTCCCCGGGG +>M +CCCCGGGGAAAATTTT diff --git a/test_data/19/gold.gtf b/test_data/19/gold.gtf new file mode 100644 index 0000000..29476bf --- /dev/null +++ b/test_data/19/gold.gtf @@ -0,0 +1,18 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +##sequence-region Y 1 16 +Y ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Y ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +##sequence-region H 1 16 +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/19/in1.fa b/test_data/19/in1.fa new file mode 100644 index 0000000..53a0c2f --- /dev/null +++ b/test_data/19/in1.fa @@ -0,0 +1,2 @@ +>Z +AAAATTTTGGGGCCCC diff --git a/test_data/19/in1.gtf b/test_data/19/in1.gtf new file mode 100644 index 0000000..26db7ce --- /dev/null +++ b/test_data/19/in1.gtf @@ -0,0 +1,5 @@ +##sequence-region Y.11 1 16 +Z ref exon 1 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref CDS 4 14 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref start_codon 1 3 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; +Z ref stop_codon 14 16 . + 0 gene_id "new_gene"; transcript_id "new_gene.1"; diff --git a/test_data/19/in2.fa b/test_data/19/in2.fa new file mode 100644 index 0000000..baf880f --- /dev/null +++ b/test_data/19/in2.fa @@ -0,0 +1,2 @@ +>Y +GGGGAATTCCCCGGGG diff --git a/test_data/19/in2.gtf b/test_data/19/in2.gtf new file mode 100644 index 0000000..9e0b425 --- /dev/null +++ b/test_data/19/in2.gtf @@ -0,0 +1,5 @@ +##sequence-region F.11 3 27 +H ref exon 1 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref CDS 4 14 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref start_codon 1 3 . + 0 gene_id "gene2"; transcript_id "gene2.1"; +H ref stop_codon 14 16 . + 0 gene_id "gene2"; transcript_id "gene2.1"; diff --git a/test_data/19/in3.fa b/test_data/19/in3.fa new file mode 100644 index 0000000..38306a6 --- /dev/null +++ b/test_data/19/in3.fa @@ -0,0 +1,2 @@ +>M +CCCCGGGGAAAATTTT diff --git a/test_data/19/in3.gtf b/test_data/19/in3.gtf new file mode 100644 index 0000000..46fe5e2 --- /dev/null +++ b/test_data/19/in3.gtf @@ -0,0 +1,5 @@ +##Test Data +M ref exon 1 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +K ref CDS 4 14 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +Z ref start_codon 1 3 . + 0 gene_id "gene4"; transcript_id "gene4.1"; +M ref stop_codon 14 16 . + 0 gene_id "gene4"; transcript_id "gene4.1"; diff --git a/test_data/19/ref.fa b/test_data/19/ref.fa new file mode 100644 index 0000000..6fa4d80 --- /dev/null +++ b/test_data/19/ref.fa @@ -0,0 +1,2 @@ +>X +ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/19/ref.gtf b/test_data/19/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/19/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index 0d421c2..2768106 100644 --- a/test_reform.py +++ b/test_reform.py @@ -762,5 +762,45 @@ def test_case_18(self): os.chdir(wd) + def test_case_19(self): + """ + Case 19: + Testing Reform with incorrect new chrom and comments in input FASTA + and annotation files. Also, testing reform with more formal printing + """ + + wd = os.getcwd() + os.chdir('test_data/19/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y,H,M" \ + --in_fasta=in1.fa,in2.fa,in3.fa \ + --in_gff=in1.gtf,in2.gtf,in3.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing gtf") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + os.chdir(wd) + if __name__ == '__main__': unittest.main() From df049016c14baddacc61b25b163ccff1422f9c90 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Mon, 21 Apr 2025 21:26:47 -0400 Subject: [PATCH 08/14] use f-string --- reform.py | 43 ++++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/reform.py b/reform.py index ec5ecbc..ea6557c 100755 --- a/reform.py +++ b/reform.py @@ -7,11 +7,11 @@ from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord try: - import pgzip as gzip_module - print("\nUsing pgzip for gzip operations.\n") + import pgzip as gzip_module + print(f"\nUsing pgzip for gzip operations.\n") except ImportError: - import gzip as gzip_module - print("\npgzip not found, falling back to gzip.\n") + import gzip as gzip_module + print(f"\npgzip not found, falling back to gzip.\n") def main(): @@ -42,7 +42,7 @@ def main(): add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) print("------------------------------------------") - print("Reform Complete") + print(f"Reform Complete") print("------------------------------------------") print(f"New .fa file created: {os.path.realpath(new_fasta)}") print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") @@ -71,9 +71,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications length_changed = len(str(record.seq)) - (down_position - position - 1) prev_modifications.append((position,length_changed)) if position != down_position: - print("Removing nucleotides from position {} - {}".format(position, down_position)) - print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}" - .format(record.description, in_arg.in_fasta[index], position, in_arg.chrom)) + print(f"Removing nucleotides from position {position} - {down_position}") + print(f"Proceeding to insert sequence '{record.description}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}") ## Build the new chromosome sequence with the inserted_seq ## If the chromosome sequence length is in the header, replace it with new length new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:] @@ -207,7 +206,7 @@ def read_fasta(in_arg, index, prev_fasta_path): raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: raise ValueError(f"Error parsing FASTA file: {str(e)}") - print("Preparing to create new FASTA file") + print(f"Preparing to create new FASTA file") print(f"Original FASTA: {real_path_fa}") ## Generate index of sequences from ref reference fasta if prev_fasta_path: @@ -290,14 +289,14 @@ def valid_gff_line(line_elements): ''' if not line_elements[0].startswith("##sequence-region"): if len(line_elements) != 9: - print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements)) + print(f"** ERROR: in_gff file does not have 9 columns, it has {len(line_elements)}") print(line_elements) return False else: ## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is ## spliting format indicator. if len(line_elements) != 5: - print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1) + print(f"** ERROR: ##sequence-region line does not have 4 columns, it has {len(line_elements) - 1}") print(line_elements) return False return True @@ -376,7 +375,7 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo position += lc if position > len(seq_str): print("** ERROR: Position greater than length of chromosome.") - print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position)) + print(f"Chromosome: {chrom}\nChromosome length: {len(seq_str)}\nPosition: {position}") exit() elif position == -1: position = len(seq_str) @@ -403,8 +402,8 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo else: print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") print("Chromosome: {}\n".format(chrom)) - print("Upstream sequence found {} times".format(upstream_seq_count)) - print("Downstream sequence found {} times".format(downstream_seq_count)) + print(f"Upstream sequence found {upstream_seq_count} times") + print(f"Downstream sequence found {downstream_seq_count} times") exit() else: print("** ERROR: You must specify a valid position or upstream and downstream sequences.") @@ -602,8 +601,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, elif gff_feat_start <= position and gff_feat_end <= down_position: # Which side of the feature depends on the strand (we add this as a comment) x = "3" if gff_feat_strand == "+" else "5" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") new_comment = format_comment( "{} prime side of feature cut-off by inserted sequence".format(x), gff_ext @@ -631,8 +629,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, and gff_feat_start <= down_position and gff_feat_end > down_position): x = "5" if gff_feat_strand == "+" else "3" - print("Feature cut off - {} prime side of feature cut off ({} strand)" - .format(x, gff_feat_strand)) + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") new_comment = format_comment( "{} prime side of feature cut-off by inserted sequence".format(x), gff_ext @@ -656,7 +653,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, gff_out.write(modified_line) else: - print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements)) + print(f"** Error: Unknown case for GFF modification. Exiting {line_elements}") exit() # If we've iterated over the entire original gff @@ -738,7 +735,7 @@ def format_comment(comment, ext): elif ext.lower().startswith('gff'): new_comment = "; reform_comment={}".format(comment) else: - print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext)) + print(f"** Error: Unrecognized extension {ext} in format_comment(). Exiting") exit() return new_comment @@ -750,15 +747,15 @@ def rename_id(line): attributes = line.split('\t')[8].strip() elements = attributes.split(';') if elements[0].startswith("ID="): - print("Renaming split feature {} --> {}_split".format(elements[0], elements[0])) + print(f"Renaming split feature {elements[0]} --> {elements[0]}_split") return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) elif elements[0].startswith("gene_id "): gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] - print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id)) + print(f"Renaming split feature {gene_id} --> {gene_id}_split") return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) else: - print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line) + print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}") return attributes def get_input_args(): From 1c263aa19892bc40a55fdf027a8e9741d7652284 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Thu, 24 Apr 2025 21:43:11 -0400 Subject: [PATCH 09/14] Helper Function for Chromosome Length Calculation --- reform.py | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/reform.py b/reform.py index ea6557c..ee7abdd 100755 --- a/reform.py +++ b/reform.py @@ -514,7 +514,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, if line.startswith("##sequence-region") and line_elements[1] == chrom_id: ## Edit the length of the chromosome original_length = int(line_elements[3]) - new_length = original_length - (down_position - position) + new_seq_length + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) line = line.replace(str(original_length), str(new_length)) gff_out.write(line) else: @@ -555,7 +555,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, # "chromosome" or "region") elif gff_feat_type in ['chromosome', 'region']: original_length = gff_feat_end - new_length = original_length - (down_position - position) + new_seq_length + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) line = line.replace(str(original_length), str(new_length)) gff_out.write(line) @@ -757,7 +757,14 @@ def rename_id(line): else: print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}") return attributes - + +def calculate_new_length(original_length, position, down_position, new_seq_length): + """ + Calculate the new chromosome/region length after a modification. + Returns: The new calculated length + """ + return original_length - (down_position - position) + new_seq_length + def get_input_args(): parser = argparse.ArgumentParser() chrom_group = parser.add_mutually_exclusive_group(required=True) From ad7d6a428caba7784787ef7c213c80c4856d4d11 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 9 May 2025 12:15:01 -0400 Subject: [PATCH 10/14] Improve printing in Reform and Test --- reform.py | 31 ++++-- test_order_explanation.py | 14 +++ test_reform.py | 208 ++++++++++++++++++++++++++++++++++++-- 3 files changed, 236 insertions(+), 17 deletions(-) create mode 100644 test_order_explanation.py diff --git a/reform.py b/reform.py index ee7abdd..9d6f667 100755 --- a/reform.py +++ b/reform.py @@ -6,17 +6,29 @@ from Bio import SeqIO from Bio.Seq import Seq from Bio.SeqRecord import SeqRecord + +## Importing gzip or pgzip module for file compression +print("------------------------------------------") +print(f"Compression Library Use:") +print("------------------------------------------") try: import pgzip as gzip_module - print(f"\nUsing pgzip for gzip operations.\n") + print(f"Using pgzip for gzip operations.") except ImportError: import gzip as gzip_module - print(f"\npgzip not found, falling back to gzip.\n") + print(f"pgzip not found, falling back to gzip.") def main(): ## Retrieve command line arguments and number of iterations in_arg, iterations = get_input_args() + + ## Print reference file paths at start of process + print("------------------------------------------") + print(f"Path of Reference Files:") + print("------------------------------------------") + print(f"Reference FASTA: {os.path.realpath(in_arg.ref_fasta)}") + print(f"Reference Annotation: {os.path.realpath(in_arg.ref_gff)}") ## List for previous postion and modification length prev_modifications = [] @@ -28,16 +40,19 @@ def main(): ## Sequential processing for index in range(iterations): # Start interation - print("-------------------------------------------") - print(f"Begin modification from in{index+1}.fa") - print("-------------------------------------------") if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: ## Modify existing chrom seq + print("-------------------------------------------") + print(f"Begin modification from in{index+1}.fa") + print("-------------------------------------------") new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ iterations, prev_gff_path) else: - ## Add new chrom + ## Add new chrom seq + print("-------------------------------------------") + print(f"Begin adding a new chromosome from in{index+1}.fa") + print("-------------------------------------------") new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) @@ -207,7 +222,7 @@ def read_fasta(in_arg, index, prev_fasta_path): except Exception as e: raise ValueError(f"Error parsing FASTA file: {str(e)}") print(f"Preparing to create new FASTA file") - print(f"Original FASTA: {real_path_fa}") + print(f"Original Input FASTA: {real_path_fa}") ## Generate index of sequences from ref reference fasta if prev_fasta_path: chrom_seqs = index_fasta(prev_fasta_path) @@ -225,7 +240,7 @@ def check_gff(in_arg, index): raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") real_path_gff = os.path.realpath(filename_gff) print("Preparing to create new annotation file") - print(f"Original Annotation: {real_path_gff}") + print(f"Original Input Annotation: {real_path_gff}") print() ### print new line def index_fasta(fasta_path): diff --git a/test_order_explanation.py b/test_order_explanation.py new file mode 100644 index 0000000..253c36b --- /dev/null +++ b/test_order_explanation.py @@ -0,0 +1,14 @@ +import unittest + +class TestOrderExample(unittest.TestCase): + def test_z_third(self): + print("This runs third despite being defined first") + + def test_a_first(self): + print("This runs first despite being defined second") + + def test_b_second(self): + print("This runs second despite being defined third") + +if __name__ == '__main__': + unittest.main() diff --git a/test_reform.py b/test_reform.py index 2768106..683538d 100644 --- a/test_reform.py +++ b/test_reform.py @@ -19,13 +19,18 @@ def cleanup(self): os.system(cleanup_command) print("Done") - def test_case_1(self): + def test_case_01(self): """ Case 1: Indel within a feature resulting in deleted sequence, inserted sequence, and splitting of existing features """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/1/') @@ -59,9 +64,14 @@ def test_case_1(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_2(self): + def test_case_02(self): """ Case 2: Indel resulting in truncating 3' side of feature. @@ -69,6 +79,11 @@ def test_case_2(self): of feature. All but first position of original feature remains. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/2/') @@ -102,9 +117,14 @@ def test_case_2(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_3(self): + def test_case_03(self): """ Case 3: Deletion resulting in removal of entire feature. @@ -112,6 +132,11 @@ def test_case_3(self): at last position of feature. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/3/') @@ -145,14 +170,24 @@ def test_case_3(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_4(self): + def test_case_04(self): """ Case 4: Indel causing feature split. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/4/') @@ -186,9 +221,14 @@ def test_case_4(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_5(self): + def test_case_05(self): """ Case 5: 1-base deletion, 10 base insertion. @@ -197,6 +237,11 @@ def test_case_5(self): remainder of the feature to be offset 9 bases downstream. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/5/') @@ -230,9 +275,14 @@ def test_case_5(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_6(self): + def test_case_06(self): """ Case 6: 2 bp deletion, 10 bp insertion. @@ -241,6 +291,11 @@ def test_case_6(self): by 8 bases. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/6/') @@ -274,9 +329,14 @@ def test_case_6(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_7(self): + def test_case_07(self): """ Case 7: Simple insertion using the position argument. @@ -284,6 +344,11 @@ def test_case_7(self): offset by length of insertion. """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/7/') @@ -316,14 +381,24 @@ def test_case_7(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_8(self): + def test_case_08(self): """ Case 8: Testing truncating features on reverse strand """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/8/') @@ -357,14 +432,24 @@ def test_case_8(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) - def test_case_9(self): + def test_case_09(self): """ Case 9: Insertion using the position argument at position 0 """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/9/') @@ -397,6 +482,11 @@ def test_case_9(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_10(self): @@ -406,6 +496,11 @@ def test_case_10(self): (end of chromosome) """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/10/') @@ -438,6 +533,11 @@ def test_case_10(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_11(self): @@ -446,6 +546,11 @@ def test_case_11(self): Testing GTF3 comment format (all above are GTF) """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/11/') @@ -478,6 +583,11 @@ def test_case_11(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) @@ -487,6 +597,11 @@ def test_case_12(self): Testing Case 2 with .gz ref sequence input """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/12/') @@ -520,6 +635,11 @@ def test_case_12(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_13(self): @@ -528,6 +648,11 @@ def test_case_13(self): Testing Case 10 with .gz ref sequence input """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/13/') @@ -560,6 +685,11 @@ def test_case_13(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_14(self): @@ -568,6 +698,11 @@ def test_case_14(self): Testing Sequential Processing which use multiple up.fa and down.fa files """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/14/') @@ -601,6 +736,11 @@ def test_case_14(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) @@ -610,6 +750,11 @@ def test_case_15(self): Testing Sequential Processing which combine test case 9, 10, 11 """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/15/') @@ -642,6 +787,11 @@ def test_case_15(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_16(self): @@ -650,6 +800,11 @@ def test_case_16(self): Testing Reform which invalid chrom """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/16/') @@ -682,6 +837,11 @@ def test_case_16(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_17(self): @@ -690,6 +850,11 @@ def test_case_17(self): Testing Reform which adding new chrom """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/17/') @@ -721,6 +886,11 @@ def test_case_17(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_18(self): @@ -729,6 +899,11 @@ def test_case_18(self): Testing Reform which adding multiple new chroms """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/18/') @@ -760,6 +935,11 @@ def test_case_18(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) def test_case_19(self): @@ -769,6 +949,11 @@ def test_case_19(self): and annotation files. Also, testing reform with more formal printing """ + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + wd = os.getcwd() os.chdir('test_data/19/') @@ -800,6 +985,11 @@ def test_case_19(self): self.assertListEqual(list(gold_fa), list(new_fa)) print("Done") + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + os.chdir(wd) if __name__ == '__main__': From 6a27a58649adbde41e33f6b4e0c1089b7533a87b Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 9 May 2025 14:52:40 -0400 Subject: [PATCH 11/14] Addauto correction for wrong input GTF/GFF length --- reform.py | 86 ++++++++++++++++++++++++++++--------------- test_data/20/gold.fa | 4 ++ test_data/20/gold.gtf | 5 +++ test_data/20/in.fa | 2 + test_data/20/in.gtf | 1 + test_data/20/ref.fa | 2 + test_data/20/ref.gtf | 4 ++ test_reform.py | 51 +++++++++++++++++++++++++ 8 files changed, 126 insertions(+), 29 deletions(-) create mode 100644 test_data/20/gold.fa create mode 100644 test_data/20/gold.gtf create mode 100644 test_data/20/in.fa create mode 100644 test_data/20/in.gtf create mode 100644 test_data/20/ref.fa create mode 100644 test_data/20/ref.gtf diff --git a/reform.py b/reform.py index 9d6f667..32d9d31 100755 --- a/reform.py +++ b/reform.py @@ -149,6 +149,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) ## Build the new chromosome sequence by append new sequence below ## Using new_chrom as the id of the new chromosome new_seq = str(record.seq) + new_seq_length = len(new_seq) new_record = SeqRecord( Seq(new_seq), id=in_arg.new_chrom[index], # Use new_chrom @@ -171,7 +172,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) SeqIO.write([new_record], f, "fasta") ## Read in new GFF features from in_gff ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line - in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq)) + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=new_seq_length) ## Create a temp file for gff, if index is not equal to last iteration annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) if index < iterations - 1: @@ -179,16 +180,16 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) temp_gff_name = temp_gff.name temp_gff.close() if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length) os.remove(prev_gff_path) else: - new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length) else: new_gff_name = annotation_name + '_reformed' + annotation_ext if prev_gff_path: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index]) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length) else: - new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index]) + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length) prev_gff_path = new_gff_path return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path @@ -428,36 +429,39 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo exit() return {'position': position, 'down_position': down_position} -def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom): +def calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length): ''' - in_gff_lines: a list of lists where each nested list is a list of - columns (in gff format) associated with each new feature to insert - split_features: contains information about features in the original GFF - file that were split due to the insertion of the new sequence. - sequence_length: length of the inserted sequence, used to determine - the new end positions in the GFF file. + Calculate the new length of the chromosome after modification. + This function checks the start and end positions of the features in the GFF file + and adjusts them based on the insertion position and the length of the inserted sequence. ''' - ## Replace the chromosome ID from in_gff with the correct chromosome ID - for l in in_gff_lines: - l[0] = chrom + ## List to store new gff lines + new_gff_lines = [] # Handling of single-line comments if len(in_gff_lines) == 1: - l = in_gff_lines[0] - ## Check length - ## l[3] is start position of fasta in in.gtf and l[4] is end position - seq_id = l[0] - if int(l[4]) - int(l[3]) + 1 != sequence_length: - print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.") - ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta - new_gff_line = modify_gff_line( - l, start=1 + position, end=sequence_length + position) - gff_out.write(new_gff_line) + ## Check if the line isn't a comment line + if in_gff_lines[0][0].startswith("##sequence-region"): + new_gff_lines.append(in_gff_lines[0]) + else: + l = in_gff_lines[0] + ## Check length + ## l[3] is start position of fasta in in.gtf and l[4] is end position + seq_id = l[0] + if int(l[4]) - int(l[3]) + 1 != sequence_length: + print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.") + ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta + new_gff_line = modify_gff_line( + l, start=1 + position, end=sequence_length + position) + new_gff_lines.append(new_gff_line) # Handling of multiple-line comments else: ### Step1: extract all start and end into corresponding set() start_positions = set() end_positions = set() for l in in_gff_lines: + ## Check length and ignore comment lines + if l[0].startswith("##sequence-region"): + continue start_positions.add(int(l[3])) end_positions.add(int(l[4])) ### Step2: Find min start and max end @@ -474,11 +478,32 @@ def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence ### Step4: Adjust start and end. Offset will be 0 if no adjust need offset = min_start - 1 for l in in_gff_lines: + ## Update length and ignore comment lines for lenght calculation + if l[0].startswith("##sequence-region"): + new_gff_lines.append(l) + continue ### Step5: Correct start(l[3]) and end(l[4]) by minus offset new_gff_line = modify_gff_line( l, start = int(l[3]) - offset + position, end = int(l[4]) - offset + position) - gff_out.write(new_gff_line) + new_gff_lines.append(new_gff_line) + return new_gff_lines +def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom): + ''' + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + split_features: contains information about features in the original GFF + file that were split due to the insertion of the new sequence. + sequence_length: length of the inserted sequence, used to determine + the new end positions in the GFF file. + ''' + ## Replace the chromosome ID from in_gff with the correct chromosome ID + for l in in_gff_lines: + l[0] = chrom + ## Check if the lenght in_gff_lines are valid, and correct if needed + new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length) + for gff_line in new_gff_lines: + gff_out.write(gff_line) ## If insertion caused any existing features to be split, add ## the split features now immediately after adding the new features for sf in split_features: @@ -693,7 +718,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, os.remove(ref_gff_path) return new_gff_name -def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id): +def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id, sequence_length): """ Appends new annotations to an existing GFF file without modifying existing features. """ @@ -721,8 +746,11 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i gff_out.write(line) ## Append new annotations if present if in_gff_lines: + ## Call calculate_new_length_for_in_gff to correct the length of the chromosome + ## Since we are not modifying existing features, we can set position to 0 + new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, 0, sequence_length) print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") - for new_annotation in in_gff_lines: + for new_annotation in new_gff_lines: if new_annotation[0] == "##sequence-region": ## Use predefined format for sequence-region line if gff_splitor == '': @@ -730,7 +758,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i ## Remove format indicater, and add new line gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n') elif new_annotation: - gff_out.write("\t".join(new_annotation)) + gff_out.write(new_annotation) ## Cleanup temp file if needed if ref_gff.endswith('.gz') and ref_gff_path != ref_gff: diff --git a/test_data/20/gold.fa b/test_data/20/gold.fa new file mode 100644 index 0000000..26b1a4a --- /dev/null +++ b/test_data/20/gold.fa @@ -0,0 +1,4 @@ +>X +XZZZABBBBBDDDDDCCCCCIIIIIKKKKK +>Y +---------- diff --git a/test_data/20/gold.gtf b/test_data/20/gold.gtf new file mode 100644 index 0000000..789a83d --- /dev/null +++ b/test_data/20/gold.gtf @@ -0,0 +1,5 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +Y new exon 1 10 . + 0 gene_id "new"; transcript_id "new.1"; diff --git a/test_data/20/in.fa b/test_data/20/in.fa new file mode 100644 index 0000000..fa2f7d3 --- /dev/null +++ b/test_data/20/in.fa @@ -0,0 +1,2 @@ +>test 1 +---------- diff --git a/test_data/20/in.gtf b/test_data/20/in.gtf new file mode 100644 index 0000000..7beb1a8 --- /dev/null +++ b/test_data/20/in.gtf @@ -0,0 +1 @@ +X new exon 1 3 . + 0 gene_id "new"; transcript_id "new.1"; diff --git a/test_data/20/ref.fa b/test_data/20/ref.fa new file mode 100644 index 0000000..b73c712 --- /dev/null +++ b/test_data/20/ref.fa @@ -0,0 +1,2 @@ +>X +XZZZABBBBBDDDDDCCCCCIIIIIKKKKK diff --git a/test_data/20/ref.gtf b/test_data/20/ref.gtf new file mode 100644 index 0000000..6737be8 --- /dev/null +++ b/test_data/20/ref.gtf @@ -0,0 +1,4 @@ +X ref exon 5 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref CDS 8 22 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref start_codon 5 7 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; +X ref stop_codon 23 25 . + 0 gene_id "ref_gene"; transcript_id "ref_gene.1"; diff --git a/test_reform.py b/test_reform.py index 683538d..fb33299 100644 --- a/test_reform.py +++ b/test_reform.py @@ -991,6 +991,57 @@ def test_case_19(self): print() os.chdir(wd) + + def test_case_20(self): + """ + Case 20: + Simple addition using the position argument. + Test auto correction for wrong input fasta name, + wrong input gtf name and length + """ + + print() + print("------------------------------------------") + print("Test Start") + print("------------------------------------------") + + wd = os.getcwd() + os.chdir('test_data/20/') + + command = """ + python3 ../../reform.py \ + --new_chrom="Y" \ + --in_fasta=in.fa \ + --in_gff=in.gtf \ + --ref_fasta=ref.fa \ + --ref_gff=ref.gtf \ + """ + + response = subprocess.getoutput(command) + print(response) + + with open('gold.gtf', 'r') as f: + gold_gff = f.read() + with open('ref_reformed.gtf', 'r') as f: + new_gff = f.read() + print("Testing GTF") + self.assertListEqual(list(gold_gff), list(new_gff)) + print("Done") + + with open('gold.fa', 'r') as f: + gold_fa = f.read() + with open('ref_reformed.fa', 'r') as f: + new_fa = f.read() + print("Testing Fasta") + self.assertListEqual(list(gold_fa), list(new_fa)) + print("Done") + + print("------------------------------------------") + print("Test Done") + print("------------------------------------------") + print() + + os.chdir(wd) if __name__ == '__main__': unittest.main() From f0aa0c0c9df37d689802e11f216109f9d96743ed Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Fri, 9 May 2025 14:55:39 -0400 Subject: [PATCH 12/14] Remove Mismatch WARNING in editing existed sequence --- reform.py | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/reform.py b/reform.py index 32d9d31..69bed1a 100755 --- a/reform.py +++ b/reform.py @@ -212,12 +212,7 @@ def read_fasta(in_arg, index, prev_fasta_path): f"and command line parameter ({in_arg.new_chrom[index]}).") print(f"Using command line chromosome name: {in_arg.new_chrom[index]}") # The actual override happens in add_new_chrom_seq where a new SeqRecord is created - elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None: - if record.id != in_arg.chrom: - print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " - f"and command line parameter ({in_arg.chrom}).") - print(f"Using command line chromosome name: {in_arg.chrom}") - # The actual override happens in modify_existing_chrom_seq where the existing sequence is modified + except IndexError: raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") except Exception as e: From 6651fbd3698b264842f5fad784e6180d9753d000 Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Tue, 13 May 2025 10:53:24 -0400 Subject: [PATCH 13/14] Update Readme --- README.md | 71 +++++++++++++++++++++++++++++++++++++++++++------------ 1 file changed, 56 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index e74560e..690bf31 100644 --- a/README.md +++ b/README.md @@ -4,25 +4,55 @@ Execution of *ref*orm requires a reference sequence (fasta), reference annotation (GFF or GTF), the novel sequences to be added (fasta), and corresponding novel annotations (GFF or GTF). A user provides as arguments the name of the modified chromosome and either the position at which the novel sequence is inserted, or the upstream and downstream sequences flanking the novel sequences. This results in the addition and/or deletion of sequence from the reference in the modified fasta file. In addition to the novel annotations, any changes to the reference annotations that result from deleted or interrupted sequence are incorporated into the modified gff. Importantly, modified gff and fasta files include a record of the modifications. +In addition to the editing functionality described above, *ref*orm also supports the addition of novel chromosomes. This requires a reference sequence (FASTA) and annotation (GFF or GTF), along with the novel chromosome to be added. Users must provide the novel sequence (FASTA) and its corresponding annotation (GFF or GTF). The new chromosome is appended to the reference files, and all associated annotations are incorporated accordingly. + Learn more at https://gencore.bio.nyu.edu/reform/ ## Usage -*ref*orm requires Python3, pgzip and Biopython v1.78 or higher. +*ref*orm requires Python3 and Biopython v1.78 or higher. -Install pgzip and biopython if you don't already have it: +Install biopython if you don't already have it: -`pip install pgzip` `pip install biopython` +Reform supports reading and writing .gz files using gzip. To accelerate compression and decompression, it optionally supports pgzip, a parallel implementation of gzip. Users must install pgzip separately to enable this feature. + +Install pgzip if you don't already have it: + +`pip install pgzip` + Invoke the python script: +```bash +### Edit a sequence within position +python3 reform.py + --chrom= \ + --position=,, \ + --in_fasta=,, \ + --in_gff=,, \ + --ref_fasta= \ + --ref_gff= ``` + +```bash +### Edit a sequence within upstream & downstream python3 reform.py --chrom= \ - --position= \ - --in_fasta= \ - --in_gff= \ + --upstream_fasta=, , \ + --downstream_fasta=,, \ + --in_fasta=,, \ + --in_gff=,, \ + --ref_fasta= \ + --ref_gff= +``` + +```bash +### Append a novel chromosome sequence +python3 reform.py + --new_chrom= \ + --in_fasta=,, \ + --in_gff=,, \ --ref_fasta= \ --ref_gff= ``` @@ -31,15 +61,17 @@ python3 reform.py `chrom` ID of the chromsome to modify -`position` Position in chromosome at which to insert . Can use `-1` to add to end of chromosome. Note: Either position, or upstream AND downstream sequence must be provided. **Note: Position is 0-based** +`new_chrom` ID of the novel chromsome to append + +`position` Position in chromosome at which to insert . Can use `-1` to add to end of chromosome. Note: Either position, or upstream AND downstream sequence must be provided. To perform multiple edits in one run, provide multiple positions separated by commas (e.g., 0,5,-1). **Note: Position is 0-based** -`upstream_fasta` Path to Fasta file with upstream sequence. Note: Either position, or upstream AND downstream sequence must be provided. +`upstream_fasta` Paths to Fasta file with upstream sequence. Note: Either position, or upstream AND downstream sequence must be provided. -`downstream_fasta` Path to Fasta file with downstream sequence. Note: Either position, or upstream AND downstream sequence must be provided. +`downstream_fasta` Paths to Fasta file with downstream sequence. Note: Either position, or upstream AND downstream sequence must be provided. -`in_fasta` Path to new sequence to be inserted into reference genome in fasta format. +`in_fasta` Paths to new sequence to be inserted into reference genome in fasta format. -`in_gff` Path to GFF file describing new fasta sequence to be inserted. +`in_gff` Paths to GFF file describing new fasta sequence to be inserted. `ref_fasta` Path to reference fasta file. @@ -50,10 +82,10 @@ python3 reform.py ``` python3 reform.py --chrom="I" \ - --upstream_fasta="data/up.fa" \ - --downstream_fasta="data/down.fa" \ - --in_fasta="data/new.fa" \ - --in_gff="data/new.gff" \ + --upstream_fasta="data/up1.fa,data/up2.fa,data/up3.fa" \ + --downstream_fasta="data/down1.fa,data/down2.fa,data/down3.fa" \ + --in_fasta="data/new1.fa,data/new2.fa,data/new3.fa" \ + --in_gff="data/new1.gff,data/new2.gff,data/new3.gff" \ --ref_fasta="data/Saccharomyces_cerevisiae.R64-1-1.dna.toplevel.fa" \ --ref_gff="data/Saccharomyces_cerevisiae.R64-1-1.34.gff3" ``` @@ -64,3 +96,12 @@ python3 reform.py `reformed.gff3` Modified GFF file. +## Test +After local deployment or modification, you can use `test_reform.py` to verify the functionality of Reform. +This script contains an automated test suite using the Python `unittest` framework. It verifies the correctness of Reform across a variety of genome editing scenarios. + +To run all tests: + +```bash +python3 test_reform.py +``` \ No newline at end of file From 6562dd69ddb6e4a7fccff1151850d2c4104f3bfa Mon Sep 17 00:00:00 2001 From: Yuwei Sun Date: Wed, 14 May 2025 10:53:10 -0400 Subject: [PATCH 14/14] Add Setup --- build/lib/reform.py | 873 +++++++++++++++++++++++++++ reform.egg-info/PKG-INFO | 12 + reform.egg-info/SOURCES.txt | 9 + reform.egg-info/dependency_links.txt | 1 + reform.egg-info/entry_points.txt | 2 + reform.egg-info/requires.txt | 2 + reform.egg-info/top_level.txt | 1 + setup.py | 19 + 8 files changed, 919 insertions(+) create mode 100644 build/lib/reform.py create mode 100644 reform.egg-info/PKG-INFO create mode 100644 reform.egg-info/SOURCES.txt create mode 100644 reform.egg-info/dependency_links.txt create mode 100644 reform.egg-info/entry_points.txt create mode 100644 reform.egg-info/requires.txt create mode 100644 reform.egg-info/top_level.txt create mode 100644 setup.py diff --git a/build/lib/reform.py b/build/lib/reform.py new file mode 100644 index 0000000..69bed1a --- /dev/null +++ b/build/lib/reform.py @@ -0,0 +1,873 @@ +#!/bin/env python +import argparse +import re +import os +import tempfile +from Bio import SeqIO +from Bio.Seq import Seq +from Bio.SeqRecord import SeqRecord + +## Importing gzip or pgzip module for file compression +print("------------------------------------------") +print(f"Compression Library Use:") +print("------------------------------------------") +try: + import pgzip as gzip_module + print(f"Using pgzip for gzip operations.") +except ImportError: + import gzip as gzip_module + print(f"pgzip not found, falling back to gzip.") + + +def main(): + ## Retrieve command line arguments and number of iterations + in_arg, iterations = get_input_args() + + ## Print reference file paths at start of process + print("------------------------------------------") + print(f"Path of Reference Files:") + print("------------------------------------------") + print(f"Reference FASTA: {os.path.realpath(in_arg.ref_fasta)}") + print(f"Reference Annotation: {os.path.realpath(in_arg.ref_gff)}") + + ## List for previous postion and modification length + prev_modifications = [] + + ## Path for the files generated in sequential processing, mainly managed by tempfile. + prev_fasta_path = None + prev_gff_path = None + + ## Sequential processing + for index in range(iterations): + # Start interation + if hasattr(in_arg, 'chrom') and in_arg.chrom is not None: + ## Modify existing chrom seq + print("-------------------------------------------") + print(f"Begin modification from in{index+1}.fa") + print("-------------------------------------------") + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \ + iterations, prev_gff_path) + else: + ## Add new chrom seq + print("-------------------------------------------") + print(f"Begin adding a new chromosome from in{index+1}.fa") + print("-------------------------------------------") + new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \ + add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations) + + print("------------------------------------------") + print(f"Reform Complete") + print("------------------------------------------") + print(f"New .fa file created: {os.path.realpath(new_fasta)}") + print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}") + +def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path): + """ + Modifies a specified and existing chromosome sequence by inserting/replacing a new sequence at a given + position and updates the corresponding FASTA and GFF files. + """ + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + check_gff(in_arg, index) + ## Obtain the sequence of the chromosome we want to modify + existing_seq = chrom_seqs[in_arg.chrom] + existing_seq_str = str(existing_seq.seq) + + ## Get the position to insert the new sequence + positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, existing_seq_str, prev_modifications) + position = positions['position'] + down_position = positions['down_position'] + ## Save current modification which include position(index) and length changed. + if position == down_position: + length_changed = len(str(record.seq)) + else: + length_changed = len(str(record.seq)) - (down_position - position - 1) + prev_modifications.append((position,length_changed)) + if position != down_position: + print(f"Removing nucleotides from position {position} - {down_position}") + print(f"Proceeding to insert sequence '{record.description}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}") + ## Build the new chromosome sequence with the inserted_seq + ## If the chromosome sequence length is in the header, replace it with new length + new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:] + chrom_length = str(len(existing_seq_str)) + new_length = str(len(new_seq)) + new_record = SeqRecord( + Seq(new_seq), + id=existing_seq.id, + description=existing_seq.description.replace(chrom_length, new_length) + ) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + with open(new_fasta, "w") as f: + for s in chrom_seqs: + if s == existing_seq.id: + SeqIO.write([new_record], f, "fasta") + else: + SeqIO.write([chrom_seqs[s]], f, "fasta") + ## Read in new GFF features from in_gff, False means modify existing chrom + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) + else: + new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq))) + + prev_gff_path = new_gff_path + + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path + +def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations): + ## Read FASTA + record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path) + ## Read annotation (gff/gtf) + check_gff(in_arg, index) + ## Check if new chromosome existed in sequence + if in_arg.new_chrom[index] in chrom_seqs: + raise ValueError(f"Chromosome {in_arg.new_chrom[index]} already exists in the FASTA file.") + ## Build the new chromosome sequence by append new sequence below + ## Using new_chrom as the id of the new chromosome + new_seq = str(record.seq) + new_seq_length = len(new_seq) + new_record = SeqRecord( + Seq(new_seq), + id=in_arg.new_chrom[index], # Use new_chrom + description=f"{in_arg.new_chrom[index]}" + ) + ## Create new fasta file with modified chromosome + if index < iterations - 1: + new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa') + new_fasta_file.close() + new_fasta = new_fasta_file.name + prev_fasta_path = new_fasta + else: + ref_basename, _ = get_ref_basename(in_arg.ref_fasta) + ref_name = ref_basename + new_fasta = ref_name + '_reformed.fa' + ## Write all the original chromosomes first, then new chromosomes. + with open(new_fasta, "w") as f: + for s in chrom_seqs: + SeqIO.write([chrom_seqs[s]], f, "fasta") + SeqIO.write([new_record], f, "fasta") + ## Read in new GFF features from in_gff + ## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line + in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=new_seq_length) + ## Create a temp file for gff, if index is not equal to last iteration + annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff) + if index < iterations - 1: + temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext) + temp_gff_name = temp_gff.name + temp_gff.close() + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length) + os.remove(prev_gff_path) + else: + new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length) + else: + new_gff_name = annotation_name + '_reformed' + annotation_ext + if prev_gff_path: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index], new_seq_length) + else: + new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index], new_seq_length) + prev_gff_path = new_gff_path + return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path + +def read_fasta(in_arg, index, prev_fasta_path): + """ + Reads the FASTA file, extracts the sequence to be inserted, + and indexes the reference genome chromosome sequences. + """ + ## Read the new fasta (to be inserted into the ref genome) + try: + filename_fa = in_arg.in_fasta[index] + if not os.path.exists(filename_fa): + raise FileNotFoundError(f"Error: File {filename_fa} does not exist.") + real_path_fa = os.path.realpath(filename_fa) + record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0] + # Check for mismatch between FASTA record ID and command line chromosome name + if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None: + if record.id != in_arg.new_chrom[index]: + print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) " + f"and command line parameter ({in_arg.new_chrom[index]}).") + print(f"Using command line chromosome name: {in_arg.new_chrom[index]}") + # The actual override happens in add_new_chrom_seq where a new SeqRecord is created + + except IndexError: + raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.") + except Exception as e: + raise ValueError(f"Error parsing FASTA file: {str(e)}") + print(f"Preparing to create new FASTA file") + print(f"Original Input FASTA: {real_path_fa}") + ## Generate index of sequences from ref reference fasta + if prev_fasta_path: + chrom_seqs = index_fasta(prev_fasta_path) + os.remove(prev_fasta_path) + else: + chrom_seqs = index_fasta(in_arg.ref_fasta) + return record, chrom_seqs + +def check_gff(in_arg, index): + """ + Reads the GFF file, verifies its existence, and prints its file path information. + """ + filename_gff = in_arg.in_gff[index] + if not os.path.exists(filename_gff): + raise FileNotFoundError(f"Error: File {filename_gff} does not exist.") + real_path_gff = os.path.realpath(filename_gff) + print("Preparing to create new annotation file") + print(f"Original Input Annotation: {real_path_gff}") + print() ### print new line + +def index_fasta(fasta_path): + ''' + Process incoming FASTA files, supporting both decompressed and uncompressed + formats. It takes a file path (fasta_path), decompresses the file into a + temporary file if it is a compressed file ending in .gz, and then indexes + the temporary file. Finally, the indexing result is returned. + ''' + if fasta_path.endswith('.gz'): + ## Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f_path = tmp_f.name + ## Use pgzip or gzip to decompress parallely. Set thread=None means use all cores + with gzip_module.open(fasta_path, 'rt', thread=None) as f: + tmp_f.write(f.read()) + chrom_seqs = SeqIO.index(tmp_f_path, 'fasta') + ## remove temp file + os.remove(tmp_f_path) + else: + chrom_seqs = SeqIO.index(fasta_path, 'fasta') + return chrom_seqs + +def get_ref_basename(filepath): + ''' + Takes a filepath and returns the filename and extension minus the .gz. + ''' + base = os.path.basename(filepath) + if base.endswith('.gz'): + base = base[:-3] # remove .gz + name, ext = os.path.splitext(base) + return name, ext + +def modify_gff_line(elements, start=None, end=None, comment=None): + ''' + Modifies an existing GFF line and returns the modified line. Currently, you can + override the start position, end position, and comment column. + + gff_out: an open file handle for writing new gff lines to + elements: a list containing each column (9 in total) of a single feature line in a gff file + start: if provided, overwrite the start position in elements with this start position + end: if provided, overwrite the end position in elements with this end position + comment: if provided, overwrite the comments column in elements with this comment + ''' + if start == None: + start = int(elements[3]) + if end == None: + end = int(elements[4]) + if comment == None: + comment = elements[8] + if not comment.endswith('\n'): + comment += '\n' + + ## Return the modified line + return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment)) + +def valid_gff_line(line_elements): + ''' + Checks if the splited line is a valid GFF line. + Returns True if valid, False otherwise. + ''' + if not line_elements[0].startswith("##sequence-region"): + if len(line_elements) != 9: + print(f"** ERROR: in_gff file does not have 9 columns, it has {len(line_elements)}") + print(line_elements) + return False + else: + ## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is + ## spliting format indicator. + if len(line_elements) != 5: + print(f"** ERROR: ##sequence-region line does not have 4 columns, it has {len(line_elements) - 1}") + print(line_elements) + return False + return True + +def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None): + ''' + Takes a gff file and returns a list of lists where + each parent list item is a single line of the gff file + and the child elements are the columns of the line + ''' + with open(in_gff, "r") as f: + in_gff_lines = [] + for line in f: + # Skip empty lines + if not line.strip(): + continue + + # Handle differently based on whether we're adding a new chromosome or modifying existing + if line.startswith("##sequence-region") and new_chrom is not None: + ## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome. + ## Select user used delimiter based on content + if '\t' in line: + line_elements = line.split('\t') + line_elements.append('\t') ## Add tab to the end as a format indicator + else: + line_elements = line.split() + line_elements.append(' ') ## Add whitespace to the end as a format indicator + if not valid_gff_line(line_elements): + exit() + # Validate new_chrom value and correct if needed + original_chrom = line_elements[1] + if original_chrom != new_chrom: + print(f"** INFO: Updating chromosome name from {original_chrom} to {new_chrom} to fit the\ + input from new_chrom parameter in command-line.") + line_elements[1] = new_chrom + # Validate sequence_length value and correct if needed + original_start, original_end = line_elements[2], line_elements[3] + if original_start != "1": + print(f"** INFO: Updating start position from {original_start} to 1 to fit the\ + format requirement of annotation file.") + line_elements[2] = "1" + if original_end != str(sequence_length): + print(f"** INFO: Updating sequence length from {original_end} to {sequence_length} to fit the\ + length of sequence in input FASTA file.") + line_elements[3] = str(sequence_length) + elif line.startswith("#"): + ## Ignore other comment lines + continue + else: + ## Split, check and add feature lines + line_elements = line.split('\t') + chorme_id = existing_chrom if existing_chrom else new_chrom + if line_elements[0] != chorme_id: + print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.") + print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}") + line_elements[0] = chorme_id + if not valid_gff_line(line_elements): + exit() + in_gff_lines.append(line_elements) + return in_gff_lines + +def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications): + ''' + Determine the position in seq_str to insert the new sequence given + the position, upstream, downstream, and chrom arguments. + Note that either position, or upstream AND downstream sequences must + be provided. + ''' + if positions and index < len(positions) and positions[index] >= -1: + print("Checking position validity") + position = positions[index] + ## Update current postion based on previous modification + for pos, lc in prev_modifications: + ## Also ignore when postion == -1 + if position >= pos: + position += lc + if position > len(seq_str): + print("** ERROR: Position greater than length of chromosome.") + print(f"Chromosome: {chrom}\nChromosome length: {len(seq_str)}\nPosition: {position}") + exit() + elif position == -1: + position = len(seq_str) + # At the moment we don't accept down position as a param + # so set down_position = position + down_position = position + print("Position valid") + else: + print("No valid position specified, checking for upstream and downstream sequence") + if index < len(upstream) and index < len(downstream): + seq_str = seq_str.upper() + upstream_fasta = list(SeqIO.parse(upstream[index], "fasta")) + upstream_seq = str(upstream_fasta[0].seq).upper() + downstream_fasta = list(SeqIO.parse(downstream[index], "fasta")) + downstream_seq = str(downstream_fasta[0].seq).upper() + # Ensure the upstream and downstream target sequences exists once in the selected chromosome, else die + upstream_seq_count = seq_str.count(upstream_seq) + downstream_seq_count = seq_str.count(downstream_seq) + if upstream_seq_count == 1 and downstream_seq_count == 1: + ## Obtain the starting position of the left_strand + new_index = seq_str.find(upstream_seq) + position = new_index + len(upstream_seq) + down_position = seq_str.find(downstream_seq) + else: + print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.") + print("Chromosome: {}\n".format(chrom)) + print(f"Upstream sequence found {upstream_seq_count} times") + print(f"Downstream sequence found {downstream_seq_count} times") + exit() + else: + print("** ERROR: You must specify a valid position or upstream and downstream sequences.") + exit() + if down_position < position: + print("** ERROR: Upstream and Downstream sequences must not overlap. Exiting.") + exit() + return {'position': position, 'down_position': down_position} + +def calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length): + ''' + Calculate the new length of the chromosome after modification. + This function checks the start and end positions of the features in the GFF file + and adjusts them based on the insertion position and the length of the inserted sequence. + ''' + ## List to store new gff lines + new_gff_lines = [] + # Handling of single-line comments + if len(in_gff_lines) == 1: + ## Check if the line isn't a comment line + if in_gff_lines[0][0].startswith("##sequence-region"): + new_gff_lines.append(in_gff_lines[0]) + else: + l = in_gff_lines[0] + ## Check length + ## l[3] is start position of fasta in in.gtf and l[4] is end position + seq_id = l[0] + if int(l[4]) - int(l[3]) + 1 != sequence_length: + print(f"** WARNING: Inconsistent length for {seq_id}. Correcting start position to 1 and end position to {sequence_length}.") + ## Correct start(l[3]) to 1 and end(l[4]) to length of insert fasta + new_gff_line = modify_gff_line( + l, start=1 + position, end=sequence_length + position) + new_gff_lines.append(new_gff_line) + # Handling of multiple-line comments + else: + ### Step1: extract all start and end into corresponding set() + start_positions = set() + end_positions = set() + for l in in_gff_lines: + ## Check length and ignore comment lines + if l[0].startswith("##sequence-region"): + continue + start_positions.add(int(l[3])) + end_positions.add(int(l[4])) + ### Step2: Find min start and max end + min_start = min(start_positions) + max_end = max(end_positions) + ### Step3: Check sequence length, validness of min_start and max_end + if max_end - min_start + 1 != sequence_length: + raise ValueError(f"Error: Annotation length does not match sequence length. " + f"Expected {sequence_length}, but got {max_end - min_start + 1}") + if min_start < 1: + raise ValueError(f"Error: Invalid min_start value: {min_start}. It must be a positive integer.") + if min_start > max_end: + raise ValueError(f"Error: min_start ({min_start}) cannot be greater than max_end ({max_end}).") + ### Step4: Adjust start and end. Offset will be 0 if no adjust need + offset = min_start - 1 + for l in in_gff_lines: + ## Update length and ignore comment lines for lenght calculation + if l[0].startswith("##sequence-region"): + new_gff_lines.append(l) + continue + ### Step5: Correct start(l[3]) and end(l[4]) by minus offset + new_gff_line = modify_gff_line( + l, start = int(l[3]) - offset + position, end = int(l[4]) - offset + position) + new_gff_lines.append(new_gff_line) + return new_gff_lines + +def write_in_gff_lines(gff_out, in_gff_lines, position, split_features, sequence_length, chrom): + ''' + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + split_features: contains information about features in the original GFF + file that were split due to the insertion of the new sequence. + sequence_length: length of the inserted sequence, used to determine + the new end positions in the GFF file. + ''' + ## Replace the chromosome ID from in_gff with the correct chromosome ID + for l in in_gff_lines: + l[0] = chrom + ## Check if the lenght in_gff_lines are valid, and correct if needed + new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, position, sequence_length) + for gff_line in new_gff_lines: + gff_out.write(gff_line) + ## If insertion caused any existing features to be split, add + ## the split features now immediately after adding the new features + for sf in split_features: + modified_line = modify_gff_line( + sf[0], start = sf[1], end = sf[2], comment = sf[3]) + gff_out.write(modified_line) + + ## Return True after writing the new GFF lines + return True + +def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length): + ''' + Goes line by line through a gff file to remove existing features + (or parts of existing features) and/or insert new features. + In the process of adding new features, existing features + may be cut-off on either end or split into two. + This attempts to handle all cases. + new_gff_name: the name of the new gff file to create + ref_gff: the reference gff file to modify + in_gff_lines: a list of lists where each nested list is a list of + columns (in gff format) associated with each new feature to insert + position: start position of removal of existing sequence + down_position: end position of removal of existing sequence + chrom_id: the ID of the chromosome to modify + new_seq_length: the length of the new sequence being added to the chromosome + ''' + with open(new_gff_name, "w") as gff_out: + in_gff_lines_appended = False + split_features = [] + last_seen_chrom_id = None + gff_ext = new_gff_name.split('.')[-1] + ref_gff_path = ref_gff + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + ## Create a tempfile to store uncompressde content + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + tmp_f.write(f.read()) + ref_gff_path = tmp_f.name + with open(ref_gff_path, "r") as f: + for line in f: + # For header lines, handle both tab and space-delimited formats + if line.startswith("#"): + if '\t' in line: + line_elements = line.split('\t') + else: + line_elements = line.split() + + if line.startswith("##sequence-region") and line_elements[1] == chrom_id: + ## Edit the length of the chromosome + original_length = int(line_elements[3]) + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + else: + # Regular feature lines are always tab-delimited + line_elements = line.split('\t') + gff_chrom_id = line_elements[0] + gff_feat_start = int(line_elements[3]) + gff_feat_end = int(line_elements[4]) + gff_feat_type = line_elements[2] + gff_feat_strand = line_elements[6] + gff_comments = line_elements[8].strip() + + # If we've seen at least one chromosome + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id) + # and now we're on a new chromosome in the original gff file + # and the in_gff_lines have not yet been appended: + # we assume they need to be appended to the end of the chromosome + # so append them before proceeding + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and gff_chrom_id != last_seen_chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + last_seen_chrom_id = gff_chrom_id + + # If this is not the chromosome of interest + # Or if the current feature ends before any + # modification (which occurs at position) + # Then simply write the feature as is (no modification) + # (remember gff co-ordinates are 1 based and position + # is 0 based, therefor check less than *or equal to*) + if gff_chrom_id != chrom_id or gff_feat_end <= position: + gff_out.write(line) + + # Modify chromosome feature length (if feature is + # "chromosome" or "region") + elif gff_feat_type in ['chromosome', 'region']: + original_length = gff_feat_end + new_length = calculate_new_length(original_length, position, down_position, new_seq_length) + line = line.replace(str(original_length), str(new_length)) + gff_out.write(line) + + # Split feature into 2 if feature starts before position + # and ends after down_position + elif gff_feat_start <= position and gff_feat_end > down_position: + print("Feature split") + print(line) + # Which side of the feature depends on the strand (we add this as a comment) + (x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5") + + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(x), + gff_ext + ) + # Modified feature ends at 'position' + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # The downstream flank(s) will be added immediately after the + # in_gff (new) features are written. + # First, attempt to rename IDs (to indicate split) + renamed_id_attributes = rename_id(line) + new_comment = format_comment( + "original feature split by inserted sequence, this is the {} prime end".format(y), + gff_ext + ) + split_features.append( + ( + line_elements, + position + new_seq_length + 1, + gff_feat_end + new_seq_length - (down_position - position), + renamed_id_attributes + new_comment + ) + ) + + # Change end position of feature to cut off point (position) if the + # feature ends within the deletion (between position & down_position) + elif gff_feat_start <= position and gff_feat_end <= down_position: + # Which side of the feature depends on the strand (we add this as a comment) + x = "3" if gff_feat_strand == "+" else "5" + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + end = position, + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Skip this feature if it falls entirely within the deletion + elif gff_feat_start > position and gff_feat_end <= down_position: + print("Skip feature (this feature was removed from sequence)") + continue + + else: + if not in_gff_lines_appended: + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Change start position of feature to after cutoff point if + # the feature starts within the deletion + if (gff_feat_start > position + and gff_feat_start <= down_position + and gff_feat_end > down_position): + x = "5" if gff_feat_strand == "+" else "3" + print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)") + new_comment = format_comment( + "{} prime side of feature cut-off by inserted sequence".format(x), + gff_ext + ) + modified_line = modify_gff_line( + line_elements, + start = position + new_seq_length + 1, + end = gff_feat_end + new_seq_length - (down_position - position), + comment = gff_comments + new_comment + ) + gff_out.write(modified_line) + + # Offset all downstream feature positions by offset length + elif gff_feat_start > down_position: + offset_length = new_seq_length - (down_position - position) + modified_line = modify_gff_line( + line_elements, + start = gff_feat_start + offset_length, + end = gff_feat_end + offset_length + ) + gff_out.write(modified_line) + + else: + print(f"** Error: Unknown case for GFF modification. Exiting {line_elements}") + exit() + + # If we've iterated over the entire original gff + # (i.e. out of the for loop which iterates over it) + # and still haven't written in_gff_lines + # and the last chromosome seen was the chromosome of interest (i.e. chrom_id): + # we assume they need to be appended to the end of the genome + # (i.e. end of the last chromosome) + # so append them now + if (last_seen_chrom_id is not None + and last_seen_chrom_id == chrom_id + and not in_gff_lines_appended): + in_gff_lines_appended = write_in_gff_lines( + gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id) + + # Checking to ensure in_gff_lines written + if not in_gff_lines_appended: + print("** Error: Something went wrong, in_gff not added to reference gff. Exiting") + exit() + # Remove temp gtf file + if ref_gff.endswith('.gz'): + os.remove(ref_gff_path) + return new_gff_name + +def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id, sequence_length): + """ + Appends new annotations to an existing GFF file without modifying existing features. + """ + gff_splitor = '' + ref_gff_path = ref_gff + ## Handle compressed .gz GFF files + if ref_gff.endswith('.gz'): + with gzip_module.open(ref_gff, 'rt') as f: + with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f: + for line in f: + tmp_f.write(line) + tmp_f.flush() + ref_gff_path = tmp_f.name + ## Open new GFF file for writing + with open(new_gff_name, "w") as gff_out: + ## Copy all existing annotations to new GFF file + with open(ref_gff_path, "r", encoding="utf-8") as f: + for line in f: + if line.startswith("##sequence-region") and gff_splitor == '': + ## Select user used delimiter based on content + if '\t' in line: + gff_splitor = ('\t') + else: + gff_splitor = (' ') + gff_out.write(line) + ## Append new annotations if present + if in_gff_lines: + ## Call calculate_new_length_for_in_gff to correct the length of the chromosome + ## Since we are not modifying existing features, we can set position to 0 + new_gff_lines = calculate_new_length_for_in_gff(in_gff_lines, 0, sequence_length) + print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.") + for new_annotation in new_gff_lines: + if new_annotation[0] == "##sequence-region": + ## Use predefined format for sequence-region line + if gff_splitor == '': + gff_splitor = new_annotation[-1] + ## Remove format indicater, and add new line + gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n') + elif new_annotation: + gff_out.write(new_annotation) + + ## Cleanup temp file if needed + if ref_gff.endswith('.gz') and ref_gff_path != ref_gff: + try: + os.remove(ref_gff_path) + except Exception as e: + print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}") + + return new_gff_name + +def format_comment(comment, ext): + ''' + Format comment according to ext (GFF or GTF) and return + ''' + if ext.lower() == 'gtf': + new_comment = 'reform_comment "{}";'.format(comment) + elif ext.lower().startswith('gff'): + new_comment = "; reform_comment={}".format(comment) + else: + print(f"** Error: Unrecognized extension {ext} in format_comment(). Exiting") + exit() + return new_comment + +def rename_id(line): + ''' + Given a gff or gtf line, this function will append the string "_split" + to the end of the ID/gene_id attribute + ''' + attributes = line.split('\t')[8].strip() + elements = attributes.split(';') + if elements[0].startswith("ID="): + print(f"Renaming split feature {elements[0]} --> {elements[0]}_split") + return ("{}_split;{}".format(elements[0], ';'.join(elements[1:]))) + elif elements[0].startswith("gene_id "): + gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1] + print(f"Renaming split feature {gene_id} --> {gene_id}_split") + return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:]))) + + else: + print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}") + return attributes + +def calculate_new_length(original_length, position, down_position, new_seq_length): + """ + Calculate the new chromosome/region length after a modification. + Returns: The new calculated length + """ + return original_length - (down_position - position) + new_seq_length + +def get_input_args(): + parser = argparse.ArgumentParser() + chrom_group = parser.add_mutually_exclusive_group(required=True) + chrom_group.add_argument('--chrom', type=str, + help="Chromosome name (String)") + chrom_group.add_argument('--new_chrom', type=str, + help="Comma-separated new chromosome name (String)") + parser.add_argument('--in_fasta', type=str, required=True, + help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") + parser.add_argument('--in_gff', type=str, required=True, + help="Path(s) to GFF file(s) describing new fasta sequence(s) to be inserted") + parser.add_argument('--upstream_fasta', type=str, default=None, + help="Path(s) to Fasta file(s) with upstream sequence. Either position, or upstream AND downstream sequence must be provided.") + parser.add_argument('--downstream_fasta', type=str, default=None, + help="Path(s) to Fasta file(s) with downstream sequence. Either position, or upstream AND downstream sequence must be provided.") + parser.add_argument('--position', type=str, default=None, + help="Comma-separated positions at which to insert new sequence. Note: Position is 0-based, no space between each comma. Either position, or upstream AND downstream sequence must be provided.") + parser.add_argument('--ref_fasta', type = str, required = True, + help = "Path to reference fasta file") + parser.add_argument('--ref_gff', type = str, required = True, + help = "Path to reference gff file") + + in_args = parser.parse_args() + in_args.in_fasta = in_args.in_fasta.split(',') + in_args.in_gff = in_args.in_gff.split(',') + if (len(in_args.in_fasta) != len(in_args.in_gff)): + print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.") + exit() + else: + iterations = len(in_args.in_fasta) + ## Modify existing chrom + if in_args.chrom: + if in_args.upstream_fasta: + in_args.upstream_fasta = in_args.upstream_fasta.split(',') + if in_args.downstream_fasta: + in_args.downstream_fasta = in_args.downstream_fasta.split(',') + if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None): + print("** Error: You must provide either the position, or the upstream and downstream sequences.") + exit() + if in_args.position is not None: + try: + in_args.position = list(map(int, in_args.position.split(','))) + except ValueError: + print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.") + exit() + if iterations != len(in_args.position): + print("** Error: Position must be a equal to number of input FASTA") + exit() + else: + if iterations != len(in_args.upstream_fasta): + print("** Error: Upstream FASTA must be a equal to number of input FASTA") + exit() + if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta): + print("** Error: The number of upstream_fasta and downstream_fasta files does not match.") + exit() + ## Add new chrom + else: + if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta: + parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.") + exit() + ## Convert new_chrom from string to list + in_args.new_chrom = in_args.new_chrom.split(',') + + return in_args, iterations + +if __name__ == "__main__": + main() + diff --git a/reform.egg-info/PKG-INFO b/reform.egg-info/PKG-INFO new file mode 100644 index 0000000..db24290 --- /dev/null +++ b/reform.egg-info/PKG-INFO @@ -0,0 +1,12 @@ +Metadata-Version: 2.4 +Name: reform +Version: 1.1.0 +Summary: A tool for editing genome sequences and annotation files +Author: gencorefacility +License: BSD-3-Clause +Requires-Dist: biopython +Requires-Dist: pgzip +Dynamic: author +Dynamic: license +Dynamic: requires-dist +Dynamic: summary diff --git a/reform.egg-info/SOURCES.txt b/reform.egg-info/SOURCES.txt new file mode 100644 index 0000000..53ddea1 --- /dev/null +++ b/reform.egg-info/SOURCES.txt @@ -0,0 +1,9 @@ +README.md +reform.py +setup.py +reform.egg-info/PKG-INFO +reform.egg-info/SOURCES.txt +reform.egg-info/dependency_links.txt +reform.egg-info/entry_points.txt +reform.egg-info/requires.txt +reform.egg-info/top_level.txt \ No newline at end of file diff --git a/reform.egg-info/dependency_links.txt b/reform.egg-info/dependency_links.txt new file mode 100644 index 0000000..8b13789 --- /dev/null +++ b/reform.egg-info/dependency_links.txt @@ -0,0 +1 @@ + diff --git a/reform.egg-info/entry_points.txt b/reform.egg-info/entry_points.txt new file mode 100644 index 0000000..c35a1e8 --- /dev/null +++ b/reform.egg-info/entry_points.txt @@ -0,0 +1,2 @@ +[console_scripts] +reform = reform:main diff --git a/reform.egg-info/requires.txt b/reform.egg-info/requires.txt new file mode 100644 index 0000000..f99b0c6 --- /dev/null +++ b/reform.egg-info/requires.txt @@ -0,0 +1,2 @@ +biopython +pgzip diff --git a/reform.egg-info/top_level.txt b/reform.egg-info/top_level.txt new file mode 100644 index 0000000..b3b8f80 --- /dev/null +++ b/reform.egg-info/top_level.txt @@ -0,0 +1 @@ +reform diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..60e727e --- /dev/null +++ b/setup.py @@ -0,0 +1,19 @@ +from setuptools import setup + +setup( + name='reform', + version='1.1.0', + py_modules=['reform'], + install_requires=[ + 'biopython', + 'pgzip', + ], + entry_points={ + 'console_scripts': [ + 'reform = reform:main', + ], + }, + author="gencorefacility", + description='A tool for editing genome sequences and annotation files', + license='BSD-3-Clause', +) \ No newline at end of file