From edfd89be6c7e2ba8395c3abbf6b74df9e30a29cf Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Wed, 29 Jan 2025 14:20:11 -0500
Subject: [PATCH 01/11] addjust script printing

---
 reform.py | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

diff --git a/reform.py b/reform.py
index 949342f..7e5655b 100755
--- a/reform.py
+++ b/reform.py
@@ -27,7 +27,11 @@ def main():
 
 	## Sequential processing
 	for index in range(iterations):
-		## Read the new fasta (to be inserted into the ref genome)
+		# Start interation
+		print("-------------------------------------------")
+		print(f"Begin modification from in{index+1}.fa")
+		print("-------------------------------------------")
+  		## Read the new fasta (to be inserted into the ref genome)
 		try:
 			record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
 		except IndexError:
@@ -91,16 +95,17 @@ def main():
 					SeqIO.write([new_record], f, "fasta")
 				else:
 					SeqIO.write([chrom_seqs[s]], f, "fasta")
-		print("New fasta file created: ", new_fasta)
-		
-		print("Preparing to create new annotation file")
+		if (index == iterations-1):
+			print("------------------------------------------")
+			print("Reform Complete")
+			print("------------------------------------------")
+			print("New fasta file created: ", new_fasta)
 		
 		## Read in new GFF features from in_gff
 		in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
 		
 		## Create a temp file for gff, if index is not equal to last iteration
 		annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
-		print(in_arg.ref_gff)
 		if index < iterations - 1:
 			temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
 			temp_gff_name = temp_gff.name
@@ -117,7 +122,9 @@ def main():
 			else:
 				new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
 		prev_gff_path = new_gff_path
-		print("New {} file created: {} ".format(annotation_ext.upper(), prev_gff_path))
+		if (index == iterations-1):
+			print(f"New fasta file created: {new_gff_path}")
+			print(f"Original Annotation: {in_arg.ref_gff}")
 
 
 def index_fasta(fasta_path):
@@ -394,6 +401,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					# and ends after down_position
 					elif gff_feat_start <= position and gff_feat_end > down_position:
 						print("Feature split")
+						print(line)
 						# Which side of the feature depends on the strand (we add this as a comment)
 						(x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5")
 						

From ee82a45cb5542585fb8fcbcc3cd8d65848f821b5 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Mon, 3 Feb 2025 13:39:04 -0500
Subject: [PATCH 02/11] add just print statement

---
 reform.py | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/reform.py b/reform.py
index 7e5655b..d164c5a 100755
--- a/reform.py
+++ b/reform.py
@@ -8,10 +8,10 @@
 from Bio.SeqRecord import SeqRecord
 try:
     import pgzip as gzip_module
-    print("Using pgzip for gzip operations.")
+    print("\nUsing pgzip for gzip operations.\n")
 except ImportError:
     import gzip as gzip_module
-    print("pgzip not found, falling back to gzip.")
+    print("\npgzip not found, falling back to gzip.\n")
 
 
 def main():
@@ -33,14 +33,26 @@ def main():
 		print("-------------------------------------------")
   		## Read the new fasta (to be inserted into the ref genome)
 		try:
+			filename_fa = in_arg.in_fasta[index]
+			if not os.path.exists(filename_fa):
+				raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
+			real_path_fa = os.path.realpath(filename_fa)
 			record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
 		except IndexError:
-			filename = in_arg.in_fasta[index].name
-			raise ValueError(f"Error: {filename} is not a valid FASTA file.")
+			raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
 		except Exception as e:
 			raise ValueError(f"Error parsing FASTA file: {str(e)}")
+		print("Preparing to create new FASTA file")
+		print(f"Original FASTA: {real_path_fa}")
+		filename_gff = in_arg.in_gff[index]
+		if not os.path.exists(filename_gff):
+			raise FileNotFoundError(f"Error: File {filename_gff} does not exist.")
+		real_path_gff = os.path.realpath(filename_gff)
+		print("Preparing to create new annotation file")
+		print(f"Original Annotation: {real_path_gff}")
+		print() ### print new line
 
-		## Generate index of sequences from ref reference fasta
+  		## Generate index of sequences from ref reference fasta
 		if prev_fasta_path:
 			chrom_seqs = index_fasta(prev_fasta_path)
 			os.remove(prev_fasta_path)
@@ -95,11 +107,6 @@ def main():
 					SeqIO.write([new_record], f, "fasta")
 				else:
 					SeqIO.write([chrom_seqs[s]], f, "fasta")
-		if (index == iterations-1):
-			print("------------------------------------------")
-			print("Reform Complete")
-			print("------------------------------------------")
-			print("New fasta file created: ", new_fasta)
 		
 		## Read in new GFF features from in_gff
 		in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
@@ -123,8 +130,11 @@ def main():
 				new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
 		prev_gff_path = new_gff_path
 		if (index == iterations-1):
-			print(f"New fasta file created: {new_gff_path}")
-			print(f"Original Annotation: {in_arg.ref_gff}")
+			print("------------------------------------------")
+			print("Reform Complete")
+			print("------------------------------------------")
+			print(f"New .fa file created:  {os.path.realpath(new_fasta)}")
+			print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
 
 
 def index_fasta(fasta_path):

From 58058ceb4a9a1ce3a3b4af8d1c8ac95377ae9bc8 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Wed, 5 Feb 2025 13:21:57 -0500
Subject: [PATCH 03/11] split modify process

---
 reform.py | 219 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 116 insertions(+), 103 deletions(-)

diff --git a/reform.py b/reform.py
index d164c5a..0d03024 100755
--- a/reform.py
+++ b/reform.py
@@ -31,111 +31,118 @@ def main():
 		print("-------------------------------------------")
 		print(f"Begin modification from in{index+1}.fa")
 		print("-------------------------------------------")
-  		## Read the new fasta (to be inserted into the ref genome)
-		try:
-			filename_fa = in_arg.in_fasta[index]
-			if not os.path.exists(filename_fa):
-				raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
-			real_path_fa = os.path.realpath(filename_fa)
-			record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
-		except IndexError:
-			raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
-		except Exception as e:
-			raise ValueError(f"Error parsing FASTA file: {str(e)}")
-		print("Preparing to create new FASTA file")
-		print(f"Original FASTA: {real_path_fa}")
-		filename_gff = in_arg.in_gff[index]
-		if not os.path.exists(filename_gff):
-			raise FileNotFoundError(f"Error: File {filename_gff} does not exist.")
-		real_path_gff = os.path.realpath(filename_gff)
-		print("Preparing to create new annotation file")
-		print(f"Original Annotation: {real_path_gff}")
-		print() ### print new line
+		if hasattr(in_arg, 'chrom'):
+			# Modify existing chrom seq
+			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \
+				=  modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \
+					iterations, prev_gff_path)
 
-  		## Generate index of sequences from ref reference fasta
-		if prev_fasta_path:
-			chrom_seqs = index_fasta(prev_fasta_path)
-			os.remove(prev_fasta_path)
-		else:	
-			chrom_seqs = index_fasta(in_arg.ref_fasta)
-		
-		## Obtain the sequence of the chromosome we want to modify
-		seq = chrom_seqs[in_arg.chrom]
-		seq_str = str(seq.seq)
-		
-		## Get the position to insert the new sequence
-		positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications)
-		position = positions['position']
-		down_position = positions['down_position']
-		
-		## Save current modification which include position(index) and length changed.
-		if position == down_position:
-			length_changed = len(str(record.seq))
-		else:
-			length_changed = len(str(record.seq)) - (down_position - position - 1)
-		prev_modifications.append((position,length_changed))
-		
-		if position != down_position:
-			print("Removing nucleotides from position {} - {}".format(position, down_position))
-		print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}"
-			.format(record.description, in_arg.in_fasta[index], position, in_arg.chrom))
-		
-		## Build the new chromosome sequence with the inserted_seq 
-		## If the chromosome sequence length is in the header, replace it with new length
-		new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:]
-		chrom_length = str(len(seq_str))
-		new_length = str(len(new_seq))
-		new_record = SeqRecord(
-			Seq(new_seq), 
-			id=seq.id, 
-			description=seq.description.replace(chrom_length, new_length)
+	print("------------------------------------------")
+	print("Reform Complete")
+	print("------------------------------------------")
+	print(f"New .fa file created:  {os.path.realpath(new_fasta)}")
+	print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
+
+def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path):
+	## Read FASTA
+	record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path)
+	## Read annotation (gff/gtf)
+	read_gff(in_arg, index)
+	## Obtain the sequence of the chromosome we want to modify
+	seq = chrom_seqs[in_arg.chrom]
+	seq_str = str(seq.seq)
+	## Get the position to insert the new sequence
+	positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications)
+	position = positions['position']
+	down_position = positions['down_position']
+	## Save current modification which include position(index) and length changed.
+	if position == down_position:
+		length_changed = len(str(record.seq))
+	else:
+		length_changed = len(str(record.seq)) - (down_position - position - 1)
+	prev_modifications.append((position,length_changed))
+	if position != down_position:
+		print("Removing nucleotides from position {} - {}".format(position, down_position))
+	print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}"
+		.format(record.description, in_arg.in_fasta[index], position, in_arg.chrom))
+	## Build the new chromosome sequence with the inserted_seq 
+	## If the chromosome sequence length is in the header, replace it with new length
+	new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:]
+	chrom_length = str(len(seq_str))
+	new_length = str(len(new_seq))
+	new_record = SeqRecord(
+		Seq(new_seq), 
+		id=seq.id, 
+		description=seq.description.replace(chrom_length, new_length)
 		)
-		
-		## Create new fasta file with modified chromosome 
-		if index < iterations - 1:
-			new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa')
-			new_fasta_file.close()
-			new_fasta = new_fasta_file.name
-			prev_fasta_path = new_fasta
-		else:
-			ref_basename, _ = get_ref_basename(in_arg.ref_fasta)
-			ref_name = ref_basename
-			new_fasta = ref_name + '_reformed.fa'
-		with open(new_fasta, "w") as f:
-			for s in chrom_seqs:
-				if s == seq.id:
-					SeqIO.write([new_record], f, "fasta")
-				else:
-					SeqIO.write([chrom_seqs[s]], f, "fasta")
-		
-		## Read in new GFF features from in_gff
-		in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
-		
-		## Create a temp file for gff, if index is not equal to last iteration
-		annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
-		if index < iterations - 1:
-			temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
-			temp_gff_name = temp_gff.name
-			temp_gff.close()
-			if prev_gff_path:
-				new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
-				os.remove(prev_gff_path)
+	## Create new fasta file with modified chromosome 
+	if index < iterations - 1:
+		new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa')
+		new_fasta_file.close()
+		new_fasta = new_fasta_file.name
+		prev_fasta_path = new_fasta
+	else:
+		ref_basename, _ = get_ref_basename(in_arg.ref_fasta)
+		ref_name = ref_basename
+		new_fasta = ref_name + '_reformed.fa'
+	with open(new_fasta, "w") as f:
+		for s in chrom_seqs:
+			if s == seq.id:
+				SeqIO.write([new_record], f, "fasta")
 			else:
-				new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+				SeqIO.write([chrom_seqs[s]], f, "fasta")
+	## Read in new GFF features from in_gff
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	## Create a temp file for gff, if index is not equal to last iteration
+	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
+	if index < iterations - 1:
+		temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
+		temp_gff_name = temp_gff.name
+		temp_gff.close()
+		if prev_gff_path:
+			new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+			os.remove(prev_gff_path)
 		else:
-			new_gff_name = annotation_name + '_reformed' + annotation_ext
-			if prev_gff_path: 
-				new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
-			else:
-				new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
-		prev_gff_path = new_gff_path
-		if (index == iterations-1):
-			print("------------------------------------------")
-			print("Reform Complete")
-			print("------------------------------------------")
-			print(f"New .fa file created:  {os.path.realpath(new_fasta)}")
-			print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
+			new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+	else:
+		new_gff_name = annotation_name + '_reformed' + annotation_ext
+		if prev_gff_path: 
+			new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+		else:
+			new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+	prev_gff_path = new_gff_path
+	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, 
 
+def read_fasta(in_arg, index, prev_fasta_path):
+	## Read the new fasta (to be inserted into the ref genome)
+	try:
+		filename_fa = in_arg.in_fasta[index]
+		if not os.path.exists(filename_fa):
+			raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
+		real_path_fa = os.path.realpath(filename_fa)
+		record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
+	except IndexError:
+		raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
+	except Exception as e:
+		raise ValueError(f"Error parsing FASTA file: {str(e)}")
+	print("Preparing to create new FASTA file")
+	print(f"Original FASTA: {real_path_fa}")
+	## Generate index of sequences from ref reference fasta
+	if prev_fasta_path:
+		chrom_seqs = index_fasta(prev_fasta_path)
+		os.remove(prev_fasta_path)
+	else:	
+		chrom_seqs = index_fasta(in_arg.ref_fasta)
+	return record, chrom_seqs
+		
+def read_gff(in_arg, index):
+	filename_gff = in_arg.in_gff[index]
+	if not os.path.exists(filename_gff):
+		raise FileNotFoundError(f"Error: File {filename_gff} does not exist.")
+	real_path_gff = os.path.realpath(filename_gff)
+	print("Preparing to create new annotation file")
+	print(f"Original Annotation: {real_path_gff}")
+	print() ### print new line
 
 def index_fasta(fasta_path):
 	'''
@@ -562,9 +569,11 @@ def rename_id(line):
 		
 def get_input_args():
 	parser = argparse.ArgumentParser()
-	
-	parser.add_argument('--chrom', type = str, required = True,
-					help = "Chromosome name (String)") 
+	chrom_group = parser.add_mutually_exclusive_group(required=True)
+	chrom_group.add_argument('--chrom', type=str, 
+                             help="Chromosome name (String)")
+	chrom_group.add_argument('--new_chrom', type=str, 
+                             help="New Chromosome name (String)")
 	parser.add_argument('--in_fasta', type=str, required=True,
                     help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") 
 	parser.add_argument('--in_gff', type=str, required=True,
@@ -610,6 +619,10 @@ def get_input_args():
 		print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.")
 		exit()
 		
+	if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta):
+		parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.")
+		exit()
+	
 	return in_args, iterations
 	
 if __name__ == "__main__":

From f1aecaa4f31211055f0137dadae03f5e28964ce9 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Fri, 7 Feb 2025 21:47:44 -0500
Subject: [PATCH 04/11] add new chrom

---
 reform.py             | 193 +++++++++++++++++++++++++++++++++------
 temp.py               | 208 ++++++++++++++++++++++++++++++++++++++++++
 test_data/17/gold.fa  |   4 +
 test_data/17/gold.gtf |   8 ++
 test_data/17/in.fa    |   2 +
 test_data/17/in.gtf   |   4 +
 test_data/17/ref.fa   |   2 +
 test_data/17/ref.gtf  |   4 +
 test_reform.py        |  39 ++++++++
 9 files changed, 434 insertions(+), 30 deletions(-)
 create mode 100644 temp.py
 create mode 100644 test_data/17/gold.fa
 create mode 100644 test_data/17/gold.gtf
 create mode 100644 test_data/17/in.fa
 create mode 100644 test_data/17/in.gtf
 create mode 100644 test_data/17/ref.fa
 create mode 100644 test_data/17/ref.gtf

diff --git a/reform.py b/reform.py
index 0d03024..fb59319 100755
--- a/reform.py
+++ b/reform.py
@@ -31,11 +31,15 @@ def main():
 		print("-------------------------------------------")
 		print(f"Begin modification from in{index+1}.fa")
 		print("-------------------------------------------")
-		if hasattr(in_arg, 'chrom'):
+		if hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
 			# Modify existing chrom seq
 			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \
 				=  modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \
 					iterations, prev_gff_path)
+		else:
+			# Add new chrom
+			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \
+				add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 
 	print("------------------------------------------")
 	print("Reform Complete")
@@ -44,7 +48,11 @@ def main():
 	print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
 
 def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, iterations, prev_gff_path):
-	## Read FASTA
+	"""
+	Modifies a specified and existing chromosome sequence by inserting/replacing a new sequence at a given 
+ 	position and updates the corresponding FASTA and GFF files.
+ 	"""
+ 	## Read FASTA
 	record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path)
 	## Read annotation (gff/gtf)
 	read_gff(in_arg, index)
@@ -111,9 +119,69 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 		else:
 			new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
 	prev_gff_path = new_gff_path
-	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path, 
+	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path
+
+def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations):
+    ## Read FASTA
+	record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path)
+	## Read annotation (gff/gtf)
+	read_gff(in_arg, index)
+	## Check if new chromosome existed in sequence
+	if in_arg.new_chrom in chrom_seqs:
+		raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.")
+	
+ 	## Build the new chromosome sequence by append new sequence below
+	## If the chromosome sequence length is in the header, replace it with new length
+	new_seq = str(record.seq)
+	new_length = str(len(new_seq))
+	new_length = str(len(new_seq))
+	new_record = SeqRecord(
+		Seq(new_seq), 
+		id=in_arg.new_chrom, # Use new_chrom
+		description=f"{in_arg.new_chrom}"
+	)
+	## Create new fasta file with modified chromosome 
+	if index < iterations - 1:
+		new_fasta_file = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix='.fa')
+		new_fasta_file.close()
+		new_fasta = new_fasta_file.name
+		prev_fasta_path = new_fasta
+	else:
+		ref_basename, _ = get_ref_basename(in_arg.ref_fasta)
+		ref_name = ref_basename
+		new_fasta = ref_name + '_reformed.fa'
+	## Write all the original chromosomes first, then new chromosomes.
+	with open(new_fasta, "w") as f:
+		for s in chrom_seqs:
+			SeqIO.write([chrom_seqs[s]], f, "fasta")
+		SeqIO.write([new_record], f, "fasta")
+	## Read in new GFF features from in_gff
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	## Create a temp file for gff, if index is not equal to last iteration
+	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
+	if index < iterations - 1:
+		temp_gff = tempfile.NamedTemporaryFile(delete=False, mode='w', suffix=annotation_ext)
+		temp_gff_name = temp_gff.name
+		temp_gff.close()
+		if prev_gff_path:
+			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom)
+			os.remove(prev_gff_path)
+		else:
+			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom)
+	else:
+		new_gff_name = annotation_name + '_reformed' + annotation_ext
+		if prev_gff_path: 
+			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom)
+		else:
+			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom)
+	prev_gff_path = new_gff_path
+	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path
 
 def read_fasta(in_arg, index, prev_fasta_path):
+	"""
+	Reads the FASTA file, extracts the sequence to be inserted,
+	and indexes the reference genome chromosome sequences.
+	"""
 	## Read the new fasta (to be inserted into the ref genome)
 	try:
 		filename_fa = in_arg.in_fasta[index]
@@ -136,6 +204,9 @@ def read_fasta(in_arg, index, prev_fasta_path):
 	return record, chrom_seqs
 		
 def read_gff(in_arg, index):
+	"""
+	Reads the GFF file, verifies its existence, and prints its file path information.
+ 	"""
 	filename_gff = in_arg.in_gff[index]
 	if not os.path.exists(filename_gff):
 		raise FileNotFoundError(f"Error: File {filename_gff} does not exist.")
@@ -535,6 +606,63 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 			os.remove(ref_gff_path)
 	return new_gff_name
 
+def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_id):
+	"""
+	Appends new annotations to an existing GFF file without modifying existing features.
+	"""
+	ref_gff_path = ref_gff
+	## Handle compressed .gz GFF files
+	if ref_gff.endswith('.gz'):
+		with gzip_module.open(ref_gff, 'rt') as f:
+			with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
+				for line in f:
+					tmp_f.write(line)
+				tmp_f.flush()
+				ref_gff_path = tmp_f.name
+    ## Open new GFF file for writing
+	with open(new_gff_name, "w") as gff_out:
+        ## Copy all existing annotations to new GFF file
+		with open(ref_gff_path, "r", encoding="utf-8") as f:
+			for line in f:
+				gff_out.write(line)
+		## Append new annotations if present
+		if in_gff_lines:
+			print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
+			for new_annotation in in_gff_lines:
+				if new_annotation:
+					gff_out.write("\t".join(new_annotation))
+	
+ 	## Cleanup temp file if needed
+	if ref_gff.endswith('.gz') and ref_gff_path != ref_gff:
+		try:
+			os.remove(ref_gff_path)
+		except Exception as e:
+			print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}")
+	
+	# with open(new_gff_name, "w") as gff_out:
+	# 	ref_gff_path = ref_gff
+	# 	if ref_gff.endswith('.gz'):
+	# 		with gzip_module.open(ref_gff, 'rt') as f:
+	# 			with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
+	# 				for line in f:
+	# 					tmp_f.write(line)
+	# 				tmp_f.flush()
+	# 				ref_gff_path = tmp_f.name
+	# 	## Copy all existing annotations to new GFF file
+	# 	with open(ref_gff_path, "r") as f:
+	# 		for line in f:
+	# 			gff_out.write(line)	
+	# 	# Append new annotations
+	# 	print(f"Appending new annotations to chromosome {chrom_id}.")
+	# 	for new_annotation in in_gff_lines:
+	# 		if new_annotation:
+	# 			gff_out.write("\t".join(new_annotation) + "\n")
+
+	# # Cleanup temp file if needed
+	# if ref_gff.endswith('.gz'):
+	# 	os.remove(ref_gff_path)
+	return new_gff_name
+
 def format_comment(comment, ext):
 	'''
 	Format comment according to ext (GFF or GTF) and return
@@ -592,36 +720,41 @@ def get_input_args():
 	in_args = parser.parse_args()
 	in_args.in_fasta = in_args.in_fasta.split(',')
 	in_args.in_gff = in_args.in_gff.split(',')
-	if in_args.upstream_fasta:
-		in_args.upstream_fasta = in_args.upstream_fasta.split(',')
-	if in_args.downstream_fasta:
-		in_args.downstream_fasta = in_args.downstream_fasta.split(',')
-
-	if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
-		print("** Error: You must provide either the position, or the upstream and downstream sequences.")
-		exit()
-	
-	if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta):
-		print("** Error: The number of upstream_fasta and downstream_fasta files does not match.")
+	if (len(in_args.in_fasta) != len(in_args.in_gff)):
+		print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.")
 		exit()
-
-	if in_args.position is not None:
-		try:
-			in_args.position = list(map(int, in_args.position.split(',')))
-			iterations = iterations = len(in_args.position)
-		except ValueError:
-			print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.")
+	else:
+		iterations = len(in_args.in_fasta)
+	## Modify existing chrom
+	if in_args.chrom:
+		if in_args.upstream_fasta:
+			in_args.upstream_fasta = in_args.upstream_fasta.split(',')
+		if in_args.downstream_fasta:
+			in_args.downstream_fasta = in_args.downstream_fasta.split(',')
+		if in_args.position is None and (in_args.upstream_fasta is None or in_args.downstream_fasta is None):
+			print("** Error: You must provide either the position, or the upstream and downstream sequences.")
+			exit()
+		if in_args.position is not None:
+			try:
+				in_args.position = list(map(int, in_args.position.split(',')))
+			except ValueError:
+				print("** Error: Position must be a comma-separated list of integers, like 1,5,-1.")
+				exit()
+			if iterations != len(in_args.position):
+				print("** Error: Position must be a equal to number of input FASTA")
+				exit()
+		else:
+			if iterations != len(in_args.upstream_fasta):
+				print("** Error: Upstream FASTA must be a equal to number of input FASTA")
+				exit()
+		if not in_args.position and len(in_args.upstream_fasta) != len(in_args.downstream_fasta):
+			print("** Error: The number of upstream_fasta and downstream_fasta files does not match.")
 			exit()
+	## Add new chrom
 	else:
-		iterations = len(in_args.upstream_fasta)
-
-	if (len(in_args.in_fasta) != len(in_args.in_gff)) or (len(in_args.in_fasta) != iterations):
-		print("** Error: The number of inserted FASTA files does not match the number of GTF files, or their counts and positions do not align.")
-		exit()
-		
-	if in_args.new_chrom and (in_args.position or in_args.upstream_fasta or in_args.downstream_fasta):
-		parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.")
-		exit()
+		if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta:
+			parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.")
+			exit()
 	
 	return in_args, iterations
 	
diff --git a/temp.py b/temp.py
new file mode 100644
index 0000000..077400a
--- /dev/null
+++ b/temp.py
@@ -0,0 +1,208 @@
+def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length):
+	'''
+	Goes line by line through a gff file to remove existing features 
+	(or parts of existing features) and/or insert new features. 
+	In the process of adding new features, existing features 
+	may be cut-off on either end or split into two. 
+	This attempts to handle all cases.
+	new_gff_name: the name of the new gff file to create
+	ref_gff: the reference gff file to modify
+	in_gff_lines: a list of lists where each nested list is a list of 
+		columns (in gff format) associated with each new feature to insert
+	position: start position of removal of existing sequence
+	down_position: end position of removal of existing sequence
+	chrom_id: the ID of the chromosome to modify
+	new_seq_length: the length of the new sequence being added to the chromosome
+	'''
+	with open(new_gff_name, "w") as gff_out:
+		in_gff_lines_appended = False
+		split_features = []
+		last_seen_chrom_id = None
+		gff_ext = new_gff_name.split('.')[-1]
+		ref_gff_path = ref_gff
+		if ref_gff.endswith('.gz'):
+			with gzip_module.open(ref_gff, 'rt') as f:
+				## Create a tempfile to store uncompressde content
+				with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
+					tmp_f.write(f.read())
+					ref_gff_path = tmp_f.name
+		with open(ref_gff_path, "r") as f:
+			for line in f:
+				line_elements = line.split('\t')
+				if line.startswith("#"):
+					if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id:
+						## Edit the length of the chromosome 
+						original_length = int(line_elements[3])
+						new_length = original_length - (down_position - position) + new_seq_length
+						line = line.replace(str(original_length), str(new_length))
+					gff_out.write(line)
+				else:
+					gff_chrom_id = line_elements[0]
+					gff_feat_start = int(line_elements[3])
+					gff_feat_end = int(line_elements[4])
+					gff_feat_type = line_elements[2]
+					gff_feat_strand = line_elements[6]
+					gff_comments = line_elements[8].strip()
+					
+					# If we've seen at least one chromosome
+					# and the last chromosome seen was the chromosome of interest (i.e. chrom_id)
+					# and now we're on a new chromosome in the original gff file
+					# and the in_gff_lines have not yet been appended:
+					# we assume they need to be appended to the end of the chromosome
+					# so append them before proceeding
+					if (last_seen_chrom_id is not None
+						and last_seen_chrom_id == chrom_id
+						and gff_chrom_id != last_seen_chrom_id 
+						and not in_gff_lines_appended):
+						in_gff_lines_appended = write_in_gff_lines(
+							gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
+					
+					last_seen_chrom_id = gff_chrom_id
+					
+					# If this is not the chromosome of interest
+					# Or if the current feature ends before any 
+					# modification (which occurs at position)
+					# Then simply write the feature as is (no modification)
+					# (remember gff co-ordinates are 1 based and position 
+					# is 0 based, therefor check less than *or equal to*)
+					if gff_chrom_id != chrom_id or gff_feat_end <= position:
+						gff_out.write(line)
+						
+					# Modify chromosome feature length (if feature is 
+					# "chromosome" or "region")
+					elif gff_feat_type in ['chromosome', 'region']:
+						original_length = gff_feat_end
+						new_length = original_length - (down_position - position) + new_seq_length
+						line = line.replace(str(original_length), str(new_length))
+						gff_out.write(line)
+						
+					# Split feature into 2 if feature starts before position
+					# and ends after down_position
+					elif gff_feat_start <= position and gff_feat_end > down_position:
+						print("Feature split")
+						print(line)
+						# Which side of the feature depends on the strand (we add this as a comment)
+						(x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5")
+						
+						new_comment = format_comment(
+							"original feature split by inserted sequence, this is the {} prime end".format(x),
+							gff_ext
+						)
+						# Modified feature ends at 'position'
+						modified_line = modify_gff_line( 
+							line_elements, 
+							end = position, 
+							comment = gff_comments + new_comment
+						)
+						gff_out.write(modified_line)
+						
+						# The downstream flank(s) will be added immediately after the 
+						# in_gff (new) features are written.
+						# First, attempt to rename IDs (to indicate split)
+						renamed_id_attributes = rename_id(line)
+						new_comment = format_comment(
+							"original feature split by inserted sequence, this is the {} prime end".format(y),
+							gff_ext
+						)
+						split_features.append(
+							(
+								line_elements, 
+								position + new_seq_length + 1, 
+								gff_feat_end + new_seq_length - (down_position - position), 
+								renamed_id_attributes + new_comment 
+							)
+						)
+					
+					# Change end position of feature to cut off point (position) if the
+					# feature ends within the deletion (between position & down_position)
+					elif gff_feat_start <= position and gff_feat_end <= down_position:
+						# Which side of the feature depends on the strand (we add this as a comment)
+						x = "3" if gff_feat_strand == "+" else "5"
+						print("Feature cut off - {} prime side of feature cut off ({} strand)"
+							.format(x, gff_feat_strand))
+						new_comment = format_comment(
+							"{} prime side of feature cut-off by inserted sequence".format(x),
+							gff_ext
+						)
+						modified_line = modify_gff_line(
+							line_elements, 
+							end = position, 
+							comment = gff_comments + new_comment 
+						)
+						gff_out.write(modified_line)
+					
+					# Skip this feature if it falls entirely within the deletion 
+					elif gff_feat_start > position and gff_feat_end <= down_position:
+						print("Skip feature (this feature was removed from sequence)")
+						continue
+						
+					else:
+						if not in_gff_lines_appended:
+							in_gff_lines_appended = write_in_gff_lines(
+								gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
+							
+						# Change start position of feature to after cutoff point if
+						# the feature starts within the deletion
+						if (gff_feat_start > position 
+							and gff_feat_start <= down_position 
+							and gff_feat_end > down_position):
+							x = "5" if gff_feat_strand == "+" else "3"
+							print("Feature cut off - {} prime side of feature cut off ({} strand)"
+								.format(x, gff_feat_strand))
+							new_comment = format_comment(
+								"{} prime side of feature cut-off by inserted sequence".format(x),
+								gff_ext
+							)
+							modified_line = modify_gff_line(
+								line_elements, 
+								start = position + new_seq_length + 1, 
+								end = gff_feat_end + new_seq_length - (down_position - position), 
+								comment = gff_comments + new_comment
+							)
+							gff_out.write(modified_line)
+							
+						# Offset all downstream feature positions by offset length
+						elif gff_feat_start > down_position:
+							offset_length = new_seq_length - (down_position - position)
+							modified_line = modify_gff_line(
+								line_elements, 
+								start = gff_feat_start + offset_length, 
+								end = gff_feat_end + offset_length
+							)
+							gff_out.write(modified_line)
+							
+						else:
+							print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements))
+							exit()
+							
+			# If we've iterated over the entire original gff
+			# (i.e. out of the for loop which iterates over it)
+			# and still haven't written in_gff_lines
+			# and the last chromosome seen was the chromosome of interest (i.e. chrom_id):
+			# we assume they need to be appended to the end of the genome
+			# (i.e. end of the last chromosome)
+			# so append them now
+			if (last_seen_chrom_id is not None 
+				and last_seen_chrom_id == chrom_id
+				and not in_gff_lines_appended):
+				in_gff_lines_appended = write_in_gff_lines(
+					gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
+			
+			# Checking to ensure in_gff_lines written
+			if not in_gff_lines_appended:
+				print("** Error: Something went wrong, in_gff not added to reference gff. Exiting")
+				exit()
+		# Remove temp gtf file
+		if ref_gff.endswith('.gz'):
+			os.remove(ref_gff_path)
+	return new_gff_name
+
+
+"""
+python3 ../../reform.py \
+		--new_chrom="Y" \
+		--in_fasta=in.fa \
+		--in_gff=in.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf
+"""
\ No newline at end of file
diff --git a/test_data/17/gold.fa b/test_data/17/gold.fa
new file mode 100644
index 0000000..79a60e4
--- /dev/null
+++ b/test_data/17/gold.fa
@@ -0,0 +1,4 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
+>Y
+AAAATTTTGGGGCCCC
diff --git a/test_data/17/gold.gtf b/test_data/17/gold.gtf
new file mode 100644
index 0000000..83f530e
--- /dev/null
+++ b/test_data/17/gold.gtf
@@ -0,0 +1,8 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/17/in.fa b/test_data/17/in.fa
new file mode 100644
index 0000000..2b55705
--- /dev/null
+++ b/test_data/17/in.fa
@@ -0,0 +1,2 @@
+>Y
+AAAATTTTGGGGCCCC
diff --git a/test_data/17/in.gtf b/test_data/17/in.gtf
new file mode 100644
index 0000000..7dc556d
--- /dev/null
+++ b/test_data/17/in.gtf
@@ -0,0 +1,4 @@
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/17/ref.fa b/test_data/17/ref.fa
new file mode 100644
index 0000000..6fa4d80
--- /dev/null
+++ b/test_data/17/ref.fa
@@ -0,0 +1,2 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
diff --git a/test_data/17/ref.gtf b/test_data/17/ref.gtf
new file mode 100644
index 0000000..6737be8
--- /dev/null
+++ b/test_data/17/ref.gtf
@@ -0,0 +1,4 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
diff --git a/test_reform.py b/test_reform.py
index a10d75e..61054b6 100644
--- a/test_reform.py
+++ b/test_reform.py
@@ -684,5 +684,44 @@ def test_case_16(self):
 
 		os.chdir(wd)
 
+	def test_case_17(self):
+		"""
+		Case 17:
+		Testing Reform which adding new chrom
+		"""
+
+		wd = os.getcwd()
+		os.chdir('test_data/17/')
+
+		command = """
+		python3 ../../reform.py \
+		--new_chrom="Y" \
+		--in_fasta=in.fa \
+		--in_gff=in.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf \
+		"""
+
+		response = subprocess.getoutput(command)
+		print(response)
+
+		with open('gold.gtf', 'r') as f:
+			gold_gff = f.read()
+		with open('ref_reformed.gtf', 'r') as f:
+			new_gff = f.read()
+		print("Testing gtf")
+		self.assertListEqual(list(gold_gff), list(new_gff))
+		print("Done")
+
+		with open('gold.fa', 'r') as f:
+			gold_fa = f.read()
+		with open('ref_reformed.fa', 'r') as f:
+			new_fa = f.read()
+		print("Testing Fasta")
+		self.assertListEqual(list(gold_fa), list(new_fa))
+		print("Done")
+
+		os.chdir(wd)
+
 if __name__ == '__main__':
     unittest.main()

From 05a6e15be09e0be00e7aa40d60f54549312a7305 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Fri, 7 Feb 2025 22:03:34 -0500
Subject: [PATCH 05/11] remove redundancy comments and temp file

---
 reform.py |  35 ++-------
 temp.py   | 208 ------------------------------------------------------
 2 files changed, 7 insertions(+), 236 deletions(-)
 delete mode 100644 temp.py

diff --git a/reform.py b/reform.py
index fb59319..dd0ed84 100755
--- a/reform.py
+++ b/reform.py
@@ -32,12 +32,12 @@ def main():
 		print(f"Begin modification from in{index+1}.fa")
 		print("-------------------------------------------")
 		if hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
-			# Modify existing chrom seq
-			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path \
-				=  modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \
-					iterations, prev_gff_path)
+			## Modify existing chrom seq
+			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \
+				modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications, \
+				iterations, prev_gff_path)
 		else:
-			# Add new chrom
+			## Add new chrom
 			new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path = \
 				add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 
@@ -118,7 +118,9 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 			new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
 		else:
 			new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+	
 	prev_gff_path = new_gff_path
+	
 	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path
 
 def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations):
@@ -246,7 +248,6 @@ def get_ref_basename(filepath):
 	name, ext = os.path.splitext(base)
 	return name, ext
 
-
 def modify_gff_line(elements, start=None, end=None, comment=None):
 	'''
 	Modifies an existing GFF line and returns the modified line. Currently, you can 
@@ -639,28 +640,6 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 		except Exception as e:
 			print(f"Warning: Failed to delete temp file {ref_gff_path}: {e}")
 	
-	# with open(new_gff_name, "w") as gff_out:
-	# 	ref_gff_path = ref_gff
-	# 	if ref_gff.endswith('.gz'):
-	# 		with gzip_module.open(ref_gff, 'rt') as f:
-	# 			with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
-	# 				for line in f:
-	# 					tmp_f.write(line)
-	# 				tmp_f.flush()
-	# 				ref_gff_path = tmp_f.name
-	# 	## Copy all existing annotations to new GFF file
-	# 	with open(ref_gff_path, "r") as f:
-	# 		for line in f:
-	# 			gff_out.write(line)	
-	# 	# Append new annotations
-	# 	print(f"Appending new annotations to chromosome {chrom_id}.")
-	# 	for new_annotation in in_gff_lines:
-	# 		if new_annotation:
-	# 			gff_out.write("\t".join(new_annotation) + "\n")
-
-	# # Cleanup temp file if needed
-	# if ref_gff.endswith('.gz'):
-	# 	os.remove(ref_gff_path)
 	return new_gff_name
 
 def format_comment(comment, ext):
diff --git a/temp.py b/temp.py
deleted file mode 100644
index 077400a..0000000
--- a/temp.py
+++ /dev/null
@@ -1,208 +0,0 @@
-def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position, chrom_id, new_seq_length):
-	'''
-	Goes line by line through a gff file to remove existing features 
-	(or parts of existing features) and/or insert new features. 
-	In the process of adding new features, existing features 
-	may be cut-off on either end or split into two. 
-	This attempts to handle all cases.
-	new_gff_name: the name of the new gff file to create
-	ref_gff: the reference gff file to modify
-	in_gff_lines: a list of lists where each nested list is a list of 
-		columns (in gff format) associated with each new feature to insert
-	position: start position of removal of existing sequence
-	down_position: end position of removal of existing sequence
-	chrom_id: the ID of the chromosome to modify
-	new_seq_length: the length of the new sequence being added to the chromosome
-	'''
-	with open(new_gff_name, "w") as gff_out:
-		in_gff_lines_appended = False
-		split_features = []
-		last_seen_chrom_id = None
-		gff_ext = new_gff_name.split('.')[-1]
-		ref_gff_path = ref_gff
-		if ref_gff.endswith('.gz'):
-			with gzip_module.open(ref_gff, 'rt') as f:
-				## Create a tempfile to store uncompressde content
-				with tempfile.NamedTemporaryFile(delete=False, mode='w') as tmp_f:
-					tmp_f.write(f.read())
-					ref_gff_path = tmp_f.name
-		with open(ref_gff_path, "r") as f:
-			for line in f:
-				line_elements = line.split('\t')
-				if line.startswith("#"):
-					if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id:
-						## Edit the length of the chromosome 
-						original_length = int(line_elements[3])
-						new_length = original_length - (down_position - position) + new_seq_length
-						line = line.replace(str(original_length), str(new_length))
-					gff_out.write(line)
-				else:
-					gff_chrom_id = line_elements[0]
-					gff_feat_start = int(line_elements[3])
-					gff_feat_end = int(line_elements[4])
-					gff_feat_type = line_elements[2]
-					gff_feat_strand = line_elements[6]
-					gff_comments = line_elements[8].strip()
-					
-					# If we've seen at least one chromosome
-					# and the last chromosome seen was the chromosome of interest (i.e. chrom_id)
-					# and now we're on a new chromosome in the original gff file
-					# and the in_gff_lines have not yet been appended:
-					# we assume they need to be appended to the end of the chromosome
-					# so append them before proceeding
-					if (last_seen_chrom_id is not None
-						and last_seen_chrom_id == chrom_id
-						and gff_chrom_id != last_seen_chrom_id 
-						and not in_gff_lines_appended):
-						in_gff_lines_appended = write_in_gff_lines(
-							gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
-					
-					last_seen_chrom_id = gff_chrom_id
-					
-					# If this is not the chromosome of interest
-					# Or if the current feature ends before any 
-					# modification (which occurs at position)
-					# Then simply write the feature as is (no modification)
-					# (remember gff co-ordinates are 1 based and position 
-					# is 0 based, therefor check less than *or equal to*)
-					if gff_chrom_id != chrom_id or gff_feat_end <= position:
-						gff_out.write(line)
-						
-					# Modify chromosome feature length (if feature is 
-					# "chromosome" or "region")
-					elif gff_feat_type in ['chromosome', 'region']:
-						original_length = gff_feat_end
-						new_length = original_length - (down_position - position) + new_seq_length
-						line = line.replace(str(original_length), str(new_length))
-						gff_out.write(line)
-						
-					# Split feature into 2 if feature starts before position
-					# and ends after down_position
-					elif gff_feat_start <= position and gff_feat_end > down_position:
-						print("Feature split")
-						print(line)
-						# Which side of the feature depends on the strand (we add this as a comment)
-						(x, y) = ("5", "3") if gff_feat_strand == "+" else ("3", "5")
-						
-						new_comment = format_comment(
-							"original feature split by inserted sequence, this is the {} prime end".format(x),
-							gff_ext
-						)
-						# Modified feature ends at 'position'
-						modified_line = modify_gff_line( 
-							line_elements, 
-							end = position, 
-							comment = gff_comments + new_comment
-						)
-						gff_out.write(modified_line)
-						
-						# The downstream flank(s) will be added immediately after the 
-						# in_gff (new) features are written.
-						# First, attempt to rename IDs (to indicate split)
-						renamed_id_attributes = rename_id(line)
-						new_comment = format_comment(
-							"original feature split by inserted sequence, this is the {} prime end".format(y),
-							gff_ext
-						)
-						split_features.append(
-							(
-								line_elements, 
-								position + new_seq_length + 1, 
-								gff_feat_end + new_seq_length - (down_position - position), 
-								renamed_id_attributes + new_comment 
-							)
-						)
-					
-					# Change end position of feature to cut off point (position) if the
-					# feature ends within the deletion (between position & down_position)
-					elif gff_feat_start <= position and gff_feat_end <= down_position:
-						# Which side of the feature depends on the strand (we add this as a comment)
-						x = "3" if gff_feat_strand == "+" else "5"
-						print("Feature cut off - {} prime side of feature cut off ({} strand)"
-							.format(x, gff_feat_strand))
-						new_comment = format_comment(
-							"{} prime side of feature cut-off by inserted sequence".format(x),
-							gff_ext
-						)
-						modified_line = modify_gff_line(
-							line_elements, 
-							end = position, 
-							comment = gff_comments + new_comment 
-						)
-						gff_out.write(modified_line)
-					
-					# Skip this feature if it falls entirely within the deletion 
-					elif gff_feat_start > position and gff_feat_end <= down_position:
-						print("Skip feature (this feature was removed from sequence)")
-						continue
-						
-					else:
-						if not in_gff_lines_appended:
-							in_gff_lines_appended = write_in_gff_lines(
-								gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
-							
-						# Change start position of feature to after cutoff point if
-						# the feature starts within the deletion
-						if (gff_feat_start > position 
-							and gff_feat_start <= down_position 
-							and gff_feat_end > down_position):
-							x = "5" if gff_feat_strand == "+" else "3"
-							print("Feature cut off - {} prime side of feature cut off ({} strand)"
-								.format(x, gff_feat_strand))
-							new_comment = format_comment(
-								"{} prime side of feature cut-off by inserted sequence".format(x),
-								gff_ext
-							)
-							modified_line = modify_gff_line(
-								line_elements, 
-								start = position + new_seq_length + 1, 
-								end = gff_feat_end + new_seq_length - (down_position - position), 
-								comment = gff_comments + new_comment
-							)
-							gff_out.write(modified_line)
-							
-						# Offset all downstream feature positions by offset length
-						elif gff_feat_start > down_position:
-							offset_length = new_seq_length - (down_position - position)
-							modified_line = modify_gff_line(
-								line_elements, 
-								start = gff_feat_start + offset_length, 
-								end = gff_feat_end + offset_length
-							)
-							gff_out.write(modified_line)
-							
-						else:
-							print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements))
-							exit()
-							
-			# If we've iterated over the entire original gff
-			# (i.e. out of the for loop which iterates over it)
-			# and still haven't written in_gff_lines
-			# and the last chromosome seen was the chromosome of interest (i.e. chrom_id):
-			# we assume they need to be appended to the end of the genome
-			# (i.e. end of the last chromosome)
-			# so append them now
-			if (last_seen_chrom_id is not None 
-				and last_seen_chrom_id == chrom_id
-				and not in_gff_lines_appended):
-				in_gff_lines_appended = write_in_gff_lines(
-					gff_out, in_gff_lines, position, split_features, new_seq_length, chrom_id)
-			
-			# Checking to ensure in_gff_lines written
-			if not in_gff_lines_appended:
-				print("** Error: Something went wrong, in_gff not added to reference gff. Exiting")
-				exit()
-		# Remove temp gtf file
-		if ref_gff.endswith('.gz'):
-			os.remove(ref_gff_path)
-	return new_gff_name
-
-
-"""
-python3 ../../reform.py \
-		--new_chrom="Y" \
-		--in_fasta=in.fa \
-		--in_gff=in.gtf \
-		--ref_fasta=ref.fa \
-		--ref_gff=ref.gtf
-"""
\ No newline at end of file

From b2c1bb446c143eac14555f07678cc4795c801304 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Mon, 3 Mar 2025 15:40:51 -0500
Subject: [PATCH 06/11] Fix bugs

---
 reform.py             | 56 +++++++++++++++++++++----------------------
 test_data/18/gold.fa  |  8 +++++++
 test_data/18/gold.gtf | 16 +++++++++++++
 test_data/18/in1.fa   |  2 ++
 test_data/18/in1.gtf  |  4 ++++
 test_data/18/in2.fa   |  2 ++
 test_data/18/in2.gtf  |  4 ++++
 test_data/18/in3.fa   |  2 ++
 test_data/18/in3.gtf  |  4 ++++
 test_data/18/ref.fa   |  2 ++
 test_data/18/ref.gtf  |  4 ++++
 test_reform.py        | 41 ++++++++++++++++++++++++++++++-
 12 files changed, 116 insertions(+), 29 deletions(-)
 create mode 100644 test_data/18/gold.fa
 create mode 100644 test_data/18/gold.gtf
 create mode 100644 test_data/18/in1.fa
 create mode 100644 test_data/18/in1.gtf
 create mode 100644 test_data/18/in2.fa
 create mode 100644 test_data/18/in2.gtf
 create mode 100644 test_data/18/in3.fa
 create mode 100644 test_data/18/in3.gtf
 create mode 100644 test_data/18/ref.fa
 create mode 100644 test_data/18/ref.gtf

diff --git a/reform.py b/reform.py
index dd0ed84..eb918fc 100755
--- a/reform.py
+++ b/reform.py
@@ -55,12 +55,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
  	## Read FASTA
 	record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path)
 	## Read annotation (gff/gtf)
-	read_gff(in_arg, index)
+	check_gff(in_arg, index)
 	## Obtain the sequence of the chromosome we want to modify
-	seq = chrom_seqs[in_arg.chrom]
-	seq_str = str(seq.seq)
+	existing_seq = chrom_seqs[in_arg.chrom]
+	existing_seq_str = str(existing_seq.seq)
+
 	## Get the position to insert the new sequence
-	positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, seq_str, prev_modifications)
+	positions = get_position(index, in_arg.position, in_arg.upstream_fasta, in_arg.downstream_fasta, in_arg.chrom, existing_seq_str, prev_modifications)
 	position = positions['position']
 	down_position = positions['down_position']
 	## Save current modification which include position(index) and length changed.
@@ -75,13 +76,13 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 		.format(record.description, in_arg.in_fasta[index], position, in_arg.chrom))
 	## Build the new chromosome sequence with the inserted_seq 
 	## If the chromosome sequence length is in the header, replace it with new length
-	new_seq = seq_str[:position] + str(record.seq) + seq_str[down_position:]
-	chrom_length = str(len(seq_str))
+	new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:]
+	chrom_length = str(len(existing_seq_str))
 	new_length = str(len(new_seq))
 	new_record = SeqRecord(
 		Seq(new_seq), 
-		id=seq.id, 
-		description=seq.description.replace(chrom_length, new_length)
+		id=existing_seq.id, 
+		description=existing_seq.description.replace(chrom_length, new_length)
 		)
 	## Create new fasta file with modified chromosome 
 	if index < iterations - 1:
@@ -95,7 +96,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 		new_fasta = ref_name + '_reformed.fa'
 	with open(new_fasta, "w") as f:
 		for s in chrom_seqs:
-			if s == seq.id:
+			if s == existing_seq.id:
 				SeqIO.write([new_record], f, "fasta")
 			else:
 				SeqIO.write([chrom_seqs[s]], f, "fasta")
@@ -108,16 +109,16 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 		temp_gff_name = temp_gff.name
 		temp_gff.close()
 		if prev_gff_path:
-			new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+			new_gff_path = create_new_gff(temp_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq)))
 			os.remove(prev_gff_path)
 		else:
-			new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+			new_gff_path = create_new_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq)))
 	else:
 		new_gff_name = annotation_name + '_reformed' + annotation_ext
 		if prev_gff_path: 
-			new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+			new_gff_path = create_new_gff(new_gff_name, prev_gff_path, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq)))
 		else:
-			new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, seq.id, len(str(record.seq)))
+			new_gff_path = create_new_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, position, down_position, existing_seq.id, len(str(record.seq)))
 	
 	prev_gff_path = new_gff_path
 	
@@ -127,20 +128,17 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
     ## Read FASTA
 	record, chrom_seqs= read_fasta(in_arg, index, prev_fasta_path)
 	## Read annotation (gff/gtf)
-	read_gff(in_arg, index)
+	check_gff(in_arg, index)
 	## Check if new chromosome existed in sequence
-	if in_arg.new_chrom in chrom_seqs:
-		raise ValueError(f"Chromosome {in_arg.new_chrom} already exists in the FASTA file.")
-	
+	if in_arg.new_chrom[index] in chrom_seqs:
+		raise ValueError(f"Chromosome {in_arg.new_chrom[index]} already exists in the FASTA file.")
  	## Build the new chromosome sequence by append new sequence below
-	## If the chromosome sequence length is in the header, replace it with new length
+	## Using new_chrom as the id of the new chromosome
 	new_seq = str(record.seq)
-	new_length = str(len(new_seq))
-	new_length = str(len(new_seq))
 	new_record = SeqRecord(
 		Seq(new_seq), 
-		id=in_arg.new_chrom, # Use new_chrom
-		description=f"{in_arg.new_chrom}"
+		id=in_arg.new_chrom[index], # Use new_chrom
+		description=f"{in_arg.new_chrom[index]}"
 	)
 	## Create new fasta file with modified chromosome 
 	if index < iterations - 1:
@@ -166,16 +164,16 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 		temp_gff_name = temp_gff.name
 		temp_gff.close()
 		if prev_gff_path:
-			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom)
+			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index])
 			os.remove(prev_gff_path)
 		else:
-			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom)
+			new_gff_path = create_new_gff_for_existing_gff(temp_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index])
 	else:
 		new_gff_name = annotation_name + '_reformed' + annotation_ext
 		if prev_gff_path: 
-			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom)
+			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, prev_gff_path, in_gff_lines, in_arg.new_chrom[index])
 		else:
-			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom)
+			new_gff_path = create_new_gff_for_existing_gff(new_gff_name, in_arg.ref_gff, in_gff_lines, in_arg.new_chrom[index])
 	prev_gff_path = new_gff_path
 	return new_fasta, annotation_ext, new_gff_path, prev_fasta_path, prev_gff_path
 
@@ -205,7 +203,7 @@ def read_fasta(in_arg, index, prev_fasta_path):
 		chrom_seqs = index_fasta(in_arg.ref_fasta)
 	return record, chrom_seqs
 		
-def read_gff(in_arg, index):
+def check_gff(in_arg, index):
 	"""
 	Reads the GFF file, verifies its existence, and prints its file path information.
  	"""
@@ -680,7 +678,7 @@ def get_input_args():
 	chrom_group.add_argument('--chrom', type=str, 
                              help="Chromosome name (String)")
 	chrom_group.add_argument('--new_chrom', type=str, 
-                             help="New Chromosome name (String)")
+                             help="Comma-separated new chromosome name (String)")
 	parser.add_argument('--in_fasta', type=str, required=True,
                     help="Path(s) to new sequence(s) to be inserted into reference genome in fasta format") 
 	parser.add_argument('--in_gff', type=str, required=True,
@@ -734,6 +732,8 @@ def get_input_args():
 		if in_args.position or in_args.upstream_fasta or in_args.downstream_fasta:
 			parser.error("** Error: When using --new_chrom, you cannot provide --position, --upstream_fasta, or --downstream_fasta.")
 			exit()
+		## Convert new_chrom from string to list
+		in_args.new_chrom = in_args.new_chrom.split(',')
 	
 	return in_args, iterations
 	
diff --git a/test_data/18/gold.fa b/test_data/18/gold.fa
new file mode 100644
index 0000000..0194856
--- /dev/null
+++ b/test_data/18/gold.fa
@@ -0,0 +1,8 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
+>Y
+AAAATTTTGGGGCCCC
+>H
+GGGGAATTCCCCGGGG
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf
new file mode 100644
index 0000000..7d876ea
--- /dev/null
+++ b/test_data/18/gold.gtf
@@ -0,0 +1,16 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/18/in1.fa b/test_data/18/in1.fa
new file mode 100644
index 0000000..2b55705
--- /dev/null
+++ b/test_data/18/in1.fa
@@ -0,0 +1,2 @@
+>Y
+AAAATTTTGGGGCCCC
diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf
new file mode 100644
index 0000000..7dc556d
--- /dev/null
+++ b/test_data/18/in1.gtf
@@ -0,0 +1,4 @@
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/18/in2.fa b/test_data/18/in2.fa
new file mode 100644
index 0000000..0b5cf0c
--- /dev/null
+++ b/test_data/18/in2.fa
@@ -0,0 +1,2 @@
+>H
+GGGGAATTCCCCGGGG
diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf
new file mode 100644
index 0000000..f4e79ad
--- /dev/null
+++ b/test_data/18/in2.gtf
@@ -0,0 +1,4 @@
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
diff --git a/test_data/18/in3.fa b/test_data/18/in3.fa
new file mode 100644
index 0000000..38306a6
--- /dev/null
+++ b/test_data/18/in3.fa
@@ -0,0 +1,2 @@
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/18/in3.gtf b/test_data/18/in3.gtf
new file mode 100644
index 0000000..ee9257c
--- /dev/null
+++ b/test_data/18/in3.gtf
@@ -0,0 +1,4 @@
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/18/ref.fa b/test_data/18/ref.fa
new file mode 100644
index 0000000..6fa4d80
--- /dev/null
+++ b/test_data/18/ref.fa
@@ -0,0 +1,2 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
diff --git a/test_data/18/ref.gtf b/test_data/18/ref.gtf
new file mode 100644
index 0000000..6737be8
--- /dev/null
+++ b/test_data/18/ref.gtf
@@ -0,0 +1,4 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
diff --git a/test_reform.py b/test_reform.py
index 61054b6..0d421c2 100644
--- a/test_reform.py
+++ b/test_reform.py
@@ -699,7 +699,46 @@ def test_case_17(self):
 		--in_fasta=in.fa \
 		--in_gff=in.gtf \
 		--ref_fasta=ref.fa \
-		--ref_gff=ref.gtf \
+		--ref_gff=ref.gtf
+		"""
+
+		response = subprocess.getoutput(command)
+		print(response)
+
+		with open('gold.gtf', 'r') as f:
+			gold_gff = f.read()
+		with open('ref_reformed.gtf', 'r') as f:
+			new_gff = f.read()
+		print("Testing gtf")
+		self.assertListEqual(list(gold_gff), list(new_gff))
+		print("Done")
+
+		with open('gold.fa', 'r') as f:
+			gold_fa = f.read()
+		with open('ref_reformed.fa', 'r') as f:
+			new_fa = f.read()
+		print("Testing Fasta")
+		self.assertListEqual(list(gold_fa), list(new_fa))
+		print("Done")
+
+		os.chdir(wd)
+	
+	def test_case_18(self):
+		"""
+		Case 18:
+		Testing Reform which adding multiple new chroms
+		"""
+
+		wd = os.getcwd()
+		os.chdir('test_data/18/')
+
+		command = """
+		python3 ../../reform.py \
+		--new_chrom="Y,H,M" \
+		--in_fasta=in1.fa,in2.fa,in3.fa \
+		--in_gff=in1.gtf,in2.gtf,in3.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf
 		"""
 
 		response = subprocess.getoutput(command)

From bf158ee990ba5ac605cc728f499ae1d8f3ab5cb5 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Fri, 14 Mar 2025 14:25:12 -0400
Subject: [PATCH 07/11] Read meta-data from input gtf/gff

---
 reform.py             | 63 ++++++++++++++++++++++++++++++++-----------
 test_data/18/gold.gtf |  1 +
 test_data/18/in1.gtf  |  1 +
 3 files changed, 49 insertions(+), 16 deletions(-)

diff --git a/reform.py b/reform.py
index eb918fc..c978a7a 100755
--- a/reform.py
+++ b/reform.py
@@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 			else:
 				SeqIO.write([chrom_seqs[s]], f, "fasta")
 	## Read in new GFF features from in_gff
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False)
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -156,7 +156,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 			SeqIO.write([chrom_seqs[s]], f, "fasta")
 		SeqIO.write([new_record], f, "fasta")
 	## Read in new GFF features from in_gff
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True)
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -269,7 +269,7 @@ def modify_gff_line(elements, start=None, end=None, comment=None):
 	## Return the modified line
 	return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment))
 	
-def get_in_gff_lines(in_gff):
+def get_in_gff_lines(in_gff, is_add_new_chrom):
 	'''
 	Takes a gff file and returns a list of lists where 
 	each parent list item is a single line of the gff file
@@ -278,16 +278,38 @@ def get_in_gff_lines(in_gff):
 	with open(in_gff, "r") as f:
 		in_gff_lines = []
 		for line in f:
-			line_elements = line.split('\t')
-			# Skip comment lines
-			if line.startswith("#"):
+			# Skip empty lines
+			if not line.strip():
 				continue
-			# Ensure lines have 9 columns
-			if len(line_elements) != 9:
-				print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
-				print(line_elements)
-				exit()
-			in_gff_lines.append(line_elements)
+			
+			# Handle differently based on whether we're adding a new chromosome or modifying existing
+			if not is_add_new_chrom:
+   				# Skip comment lines when modifying existing chromosome
+				if line.startswith("#"):
+					continue
+				# Parse with tab delimiter for regular lines
+				line_elements = line.split('\t')
+				# Ensure lines have 9 columns when modifying existing chromosome
+				if len(line_elements) != 9:
+					print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
+					print(line_elements)
+					exit()
+				in_gff_lines.append(line_elements)
+			else:
+				# For adding new chromosomes, process all header and feature lines
+				if line.startswith("#"):
+					# Select appropriate delimiter based on content
+					if '\t' in line:
+						line_elements = line.split('\t')
+					else:
+						line_elements = line.split()
+					if line_elements[0] == "##sequence-region":
+						line_elements[-1] += '\n'
+						in_gff_lines.append(line_elements)
+				elif len(line.split('\t')) == 9:
+					# Normal feature line with 9 tab-delimited columns
+					in_gff_lines.append(line.split('\t'))
+		print(in_gff_lines) 
 	return in_gff_lines
 	
 def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications):
@@ -436,15 +458,22 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					ref_gff_path = tmp_f.name
 		with open(ref_gff_path, "r") as f:
 			for line in f:
-				line_elements = line.split('\t')
+				# For header lines, handle both tab and space-delimited formats
 				if line.startswith("#"):
-					if line_elements[0] == "##sequence-region" and line_elements[1] == chrom_id:
+					if '\t' in line:
+						line_elements = line.split('\t')
+					else:
+						line_elements = line.split()
+					
+					if line.startswith("##sequence-region") and line_elements[1] == chrom_id:
 						## Edit the length of the chromosome 
 						original_length = int(line_elements[3])
 						new_length = original_length - (down_position - position) + new_seq_length
 						line = line.replace(str(original_length), str(new_length))
 					gff_out.write(line)
 				else:
+					# Regular feature lines are always tab-delimited
+					line_elements = line.split('\t')
 					gff_chrom_id = line_elements[0]
 					gff_feat_start = int(line_elements[3])
 					gff_feat_end = int(line_elements[4])
@@ -628,7 +657,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 		if in_gff_lines:
 			print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
 			for new_annotation in in_gff_lines:
-				if new_annotation:
+				if new_annotation[0] == "##sequence-region":
+					gff_out.write(" ".join(new_annotation))
+				elif new_annotation:
 					gff_out.write("\t".join(new_annotation))
 	
  	## Cleanup temp file if needed
@@ -739,4 +770,4 @@ def get_input_args():
 	
 if __name__ == "__main__":
 	main()
-	
+
diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf
index 7d876ea..de2f092 100644
--- a/test_data/18/gold.gtf
+++ b/test_data/18/gold.gtf
@@ -2,6 +2,7 @@ X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+##sequence-region NC_000001.11 1 248956422
 Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf
index 7dc556d..86a6f67 100644
--- a/test_data/18/in1.gtf
+++ b/test_data/18/in1.gtf
@@ -1,3 +1,4 @@
+##sequence-region NC_000001.11 1 248956422
 Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";

From d7047cd3e3d78e9293c23542a6f43202b8402d6a Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Mon, 14 Apr 2025 17:49:23 -0400
Subject: [PATCH 08/11] feature improvement

---
 reform.py             | 93 ++++++++++++++++++++++++++++---------------
 test_data/18/gold.gtf |  3 +-
 test_data/18/in1.gtf  |  2 +-
 test_data/18/in2.gtf  |  1 +
 4 files changed, 66 insertions(+), 33 deletions(-)

diff --git a/reform.py b/reform.py
index c978a7a..3661bb0 100755
--- a/reform.py
+++ b/reform.py
@@ -100,8 +100,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 				SeqIO.write([new_record], f, "fasta")
 			else:
 				SeqIO.write([chrom_seqs[s]], f, "fasta")
-	## Read in new GFF features from in_gff
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], False)
+	## Read in new GFF features from in_gff, False means modify existing chrom
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -156,7 +156,8 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 			SeqIO.write([chrom_seqs[s]], f, "fasta")
 		SeqIO.write([new_record], f, "fasta")
 	## Read in new GFF features from in_gff
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], True)
+	## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
+	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq))
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -268,8 +269,27 @@ def modify_gff_line(elements, start=None, end=None, comment=None):
 
 	## Return the modified line
 	return("{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}\t{}".format(elements[0], elements[1], elements[2], start, end, elements[5], elements[6], elements[7], comment))
-	
-def get_in_gff_lines(in_gff, is_add_new_chrom):
+
+def valid_gff_line(line_elements):
+	'''
+	Checks if the splited line is a valid GFF line. 
+	Returns True if valid, False otherwise.
+	'''
+	if not line_elements[0].startswith("##sequence-region"):
+		if len(line_elements) != 9:
+			print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
+			print(line_elements)
+			return False
+	else:
+		## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is
+		## spliting format indicator.
+		if len(line_elements) != 5:
+			print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1)
+			print(line_elements)
+			return False
+	return True
+
+def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
 	'''
 	Takes a gff file and returns a list of lists where 
 	each parent list item is a single line of the gff file
@@ -283,33 +303,42 @@ def get_in_gff_lines(in_gff, is_add_new_chrom):
 				continue
 			
 			# Handle differently based on whether we're adding a new chromosome or modifying existing
-			if not is_add_new_chrom:
-   				# Skip comment lines when modifying existing chromosome
-				if line.startswith("#"):
-					continue
-				# Parse with tab delimiter for regular lines
-				line_elements = line.split('\t')
-				# Ensure lines have 9 columns when modifying existing chromosome
-				if len(line_elements) != 9:
-					print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
-					print(line_elements)
+			if line.startswith("##sequence-region"):
+				## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome.
+				## Select user used delimiter based on content
+				if '\t' in line:
+					line_elements = line.split('\t')
+					line_elements.append('\t') ## Add tab to the end as a format indicator
+				else:
+					line_elements = line.split()
+					line_elements.append(' ') ## Add whitespace to the end as a format indicator
+				if not valid_gff_line(line_elements):
 					exit()
-				in_gff_lines.append(line_elements)
+				# Validate new_chrom value and correct if needed
+				original_chrom = line_elements[1]
+				if original_chrom != new_chrom:
+					print(f"** INFO: Updating chromosome name from {original_chrom} to {new_chrom} to fit the\
+        				input from new_chrom parameter in command-line.")
+					line_elements[1] = new_chrom
+				# Validate sequence_length value and correct if needed
+				original_start, original_end = line_elements[2], line_elements[3]
+				if original_start != "1":
+					print(f"** INFO: Updating start position from {original_start} to 1 to fit the\
+         				format requirement of annotation file.")
+					line_elements[2] = "1"
+				if original_end != str(sequence_length):
+					print(f"** INFO: Updating sequence length from {original_end} to {sequence_length} to fit the\
+        				length of sequence in input FASTA file.")
+				line_elements[3] = str(sequence_length)
+			elif line.startswith("#"):
+				## Ignore other comment lines
+				continue
 			else:
-				# For adding new chromosomes, process all header and feature lines
-				if line.startswith("#"):
-					# Select appropriate delimiter based on content
-					if '\t' in line:
-						line_elements = line.split('\t')
-					else:
-						line_elements = line.split()
-					if line_elements[0] == "##sequence-region":
-						line_elements[-1] += '\n'
-						in_gff_lines.append(line_elements)
-				elif len(line.split('\t')) == 9:
-					# Normal feature line with 9 tab-delimited columns
-					in_gff_lines.append(line.split('\t'))
-		print(in_gff_lines) 
+				## Split, check and add feature lines
+				line_elements = line.split('\t')
+				if not valid_gff_line(line_elements):
+					exit()
+			in_gff_lines.append(line_elements)
 	return in_gff_lines
 	
 def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_modifications):
@@ -658,7 +687,9 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 			print(f"Appending {len(in_gff_lines)} new annotations to chromosome {chrom_id}.")
 			for new_annotation in in_gff_lines:
 				if new_annotation[0] == "##sequence-region":
-					gff_out.write(" ".join(new_annotation))
+					## Use predefined format for sequence-region line
+					## Remove format indicater, and add new line
+					gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n')
 				elif new_annotation:
 					gff_out.write("\t".join(new_annotation))
 	
diff --git a/test_data/18/gold.gtf b/test_data/18/gold.gtf
index de2f092..29476bf 100644
--- a/test_data/18/gold.gtf
+++ b/test_data/18/gold.gtf
@@ -2,11 +2,12 @@ X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
 X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
-##sequence-region NC_000001.11 1 248956422
+##sequence-region Y 1 16
 Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+##sequence-region H 1 16
 H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
 H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
 H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
diff --git a/test_data/18/in1.gtf b/test_data/18/in1.gtf
index 86a6f67..326d9fa 100644
--- a/test_data/18/in1.gtf
+++ b/test_data/18/in1.gtf
@@ -1,4 +1,4 @@
-##sequence-region NC_000001.11 1 248956422
+##sequence-region Y.11 1 16
 Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
 Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/18/in2.gtf b/test_data/18/in2.gtf
index f4e79ad..9e0b425 100644
--- a/test_data/18/in2.gtf
+++ b/test_data/18/in2.gtf
@@ -1,3 +1,4 @@
+##sequence-region F.11 3 27
 H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
 H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
 H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";

From eccc15cdab99019814ccc2f724c3d835d5d1929e Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Mon, 21 Apr 2025 20:25:36 -0400
Subject: [PATCH 09/11] Resolve comments

---
 reform.py             | 37 ++++++++++++++++++++++++++++++++-----
 test_data/19/gold.fa  |  8 ++++++++
 test_data/19/gold.gtf | 18 ++++++++++++++++++
 test_data/19/in1.fa   |  2 ++
 test_data/19/in1.gtf  |  5 +++++
 test_data/19/in2.fa   |  2 ++
 test_data/19/in2.gtf  |  5 +++++
 test_data/19/in3.fa   |  2 ++
 test_data/19/in3.gtf  |  5 +++++
 test_data/19/ref.fa   |  2 ++
 test_data/19/ref.gtf  |  4 ++++
 test_reform.py        | 40 ++++++++++++++++++++++++++++++++++++++++
 12 files changed, 125 insertions(+), 5 deletions(-)
 create mode 100644 test_data/19/gold.fa
 create mode 100644 test_data/19/gold.gtf
 create mode 100644 test_data/19/in1.fa
 create mode 100644 test_data/19/in1.gtf
 create mode 100644 test_data/19/in2.fa
 create mode 100644 test_data/19/in2.gtf
 create mode 100644 test_data/19/in3.fa
 create mode 100644 test_data/19/in3.gtf
 create mode 100644 test_data/19/ref.fa
 create mode 100644 test_data/19/ref.gtf

diff --git a/reform.py b/reform.py
index 3661bb0..ec5ecbc 100755
--- a/reform.py
+++ b/reform.py
@@ -101,7 +101,7 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 			else:
 				SeqIO.write([chrom_seqs[s]], f, "fasta")
 	## Read in new GFF features from in_gff, False means modify existing chrom
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index])
+	in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], existing_chrom=in_arg.chrom, new_chrom=None)
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -157,7 +157,7 @@ def add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 		SeqIO.write([new_record], f, "fasta")
 	## Read in new GFF features from in_gff
 	## Pass the new_chrom name from command line and the length of the new sequence to correct ##sequence-region line
-	in_gff_lines = get_in_gff_lines(in_arg.in_gff[index], in_arg.new_chrom[index], len(new_seq))
+	in_gff_lines = get_in_gff_lines(in_gff=in_arg.in_gff[index], new_chrom=in_arg.new_chrom[index], sequence_length=len(new_seq))
 	## Create a temp file for gff, if index is not equal to last iteration
 	annotation_name, annotation_ext = get_ref_basename(in_arg.ref_gff)
 	if index < iterations - 1:
@@ -190,6 +190,19 @@ def read_fasta(in_arg, index, prev_fasta_path):
 			raise FileNotFoundError(f"Error: File {filename_fa} does not exist.")
 		real_path_fa = os.path.realpath(filename_fa)
 		record = list(SeqIO.parse(in_arg.in_fasta[index], "fasta"))[0]
+		# Check for mismatch between FASTA record ID and command line chromosome name
+		if hasattr(in_arg, 'new_chrom') and in_arg.new_chrom is not None:
+			if record.id != in_arg.new_chrom[index]:
+				print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
+                      f"and command line parameter ({in_arg.new_chrom[index]}).")
+				print(f"Using command line chromosome name: {in_arg.new_chrom[index]}")
+                # The actual override happens in add_new_chrom_seq where a new SeqRecord is created
+		elif hasattr(in_arg, 'chrom') and in_arg.chrom is not None:
+			if record.id != in_arg.chrom:
+				print(f"** WARNING: Mismatch detected between chromosome name in input FASTA ({record.id}) "
+					  f"and command line parameter ({in_arg.chrom}).")
+				print(f"Using command line chromosome name: {in_arg.chrom}")
+				# The actual override happens in modify_existing_chrom_seq where the existing sequence is modified
 	except IndexError:
 		raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
 	except Exception as e:
@@ -289,7 +302,7 @@ def valid_gff_line(line_elements):
 			return False
 	return True
 
-def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
+def get_in_gff_lines(in_gff=None, existing_chrom=None, new_chrom=None, sequence_length=None):
 	'''
 	Takes a gff file and returns a list of lists where 
 	each parent list item is a single line of the gff file
@@ -303,7 +316,7 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
 				continue
 			
 			# Handle differently based on whether we're adding a new chromosome or modifying existing
-			if line.startswith("##sequence-region"):
+			if line.startswith("##sequence-region") and new_chrom is not None:
 				## Paste ##sequence-region line which only exists in gtf/gff for adding new chromosome.
 				## Select user used delimiter based on content
 				if '\t' in line:
@@ -336,6 +349,11 @@ def get_in_gff_lines(in_gff, new_chrom=None, sequence_length=None):
 			else:
 				## Split, check and add feature lines
 				line_elements = line.split('\t')
+				chorme_id = existing_chrom if existing_chrom else new_chrom
+				if line_elements[0] != chorme_id:
+					print("** Warning: The chromosome name in the GFF file does not match the new chromosome name.")
+					print(f"Correct the chromosome name {line_elements[0]} to {chorme_id}")
+					line_elements[0] = chorme_id
 				if not valid_gff_line(line_elements):
 					exit()
 			in_gff_lines.append(line_elements)
@@ -667,6 +685,7 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 	"""
 	Appends new annotations to an existing GFF file without modifying existing features.
 	"""
+	gff_splitor = ''
 	ref_gff_path = ref_gff
 	## Handle compressed .gz GFF files
 	if ref_gff.endswith('.gz'):
@@ -681,6 +700,12 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
         ## Copy all existing annotations to new GFF file
 		with open(ref_gff_path, "r", encoding="utf-8") as f:
 			for line in f:
+				if line.startswith("##sequence-region") and gff_splitor == '':
+					## Select user used delimiter based on content
+					if '\t' in line:
+						gff_splitor = ('\t')
+					else:
+						gff_splitor = (' ')
 				gff_out.write(line)
 		## Append new annotations if present
 		if in_gff_lines:
@@ -688,8 +713,10 @@ def create_new_gff_for_existing_gff(new_gff_name, ref_gff, in_gff_lines, chrom_i
 			for new_annotation in in_gff_lines:
 				if new_annotation[0] == "##sequence-region":
 					## Use predefined format for sequence-region line
+					if gff_splitor == '':
+						gff_splitor = new_annotation[-1]
 					## Remove format indicater, and add new line
-					gff_out.write(new_annotation[-1].join(new_annotation[:-1])+'\n')
+					gff_out.write(gff_splitor.join(new_annotation[:-1])+'\n')
 				elif new_annotation:
 					gff_out.write("\t".join(new_annotation))
 	
diff --git a/test_data/19/gold.fa b/test_data/19/gold.fa
new file mode 100644
index 0000000..0194856
--- /dev/null
+++ b/test_data/19/gold.fa
@@ -0,0 +1,8 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
+>Y
+AAAATTTTGGGGCCCC
+>H
+GGGGAATTCCCCGGGG
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/19/gold.gtf b/test_data/19/gold.gtf
new file mode 100644
index 0000000..29476bf
--- /dev/null
+++ b/test_data/19/gold.gtf
@@ -0,0 +1,18 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+##sequence-region Y 1 16
+Y	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Y	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+##sequence-region H 1 16
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/19/in1.fa b/test_data/19/in1.fa
new file mode 100644
index 0000000..53a0c2f
--- /dev/null
+++ b/test_data/19/in1.fa
@@ -0,0 +1,2 @@
+>Z
+AAAATTTTGGGGCCCC
diff --git a/test_data/19/in1.gtf b/test_data/19/in1.gtf
new file mode 100644
index 0000000..26db7ce
--- /dev/null
+++ b/test_data/19/in1.gtf
@@ -0,0 +1,5 @@
+##sequence-region Y.11 1 16
+Z	ref	exon	1	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	CDS	4	14	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	start_codon	1	3	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
+Z	ref	stop_codon	14	16	.	+	0	gene_id "new_gene"; transcript_id "new_gene.1";
diff --git a/test_data/19/in2.fa b/test_data/19/in2.fa
new file mode 100644
index 0000000..baf880f
--- /dev/null
+++ b/test_data/19/in2.fa
@@ -0,0 +1,2 @@
+>Y
+GGGGAATTCCCCGGGG
diff --git a/test_data/19/in2.gtf b/test_data/19/in2.gtf
new file mode 100644
index 0000000..9e0b425
--- /dev/null
+++ b/test_data/19/in2.gtf
@@ -0,0 +1,5 @@
+##sequence-region F.11 3 27
+H	ref	exon	1	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	CDS	4	14	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	start_codon	1	3	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
+H	ref	stop_codon	14	16	.	+	0	gene_id "gene2"; transcript_id "gene2.1";
diff --git a/test_data/19/in3.fa b/test_data/19/in3.fa
new file mode 100644
index 0000000..38306a6
--- /dev/null
+++ b/test_data/19/in3.fa
@@ -0,0 +1,2 @@
+>M
+CCCCGGGGAAAATTTT
diff --git a/test_data/19/in3.gtf b/test_data/19/in3.gtf
new file mode 100644
index 0000000..46fe5e2
--- /dev/null
+++ b/test_data/19/in3.gtf
@@ -0,0 +1,5 @@
+##Test Data
+M	ref	exon	1	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+K	ref	CDS	4	14	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+Z	ref	start_codon	1	3	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
+M	ref	stop_codon	14	16	.	+	0	gene_id "gene4"; transcript_id "gene4.1";
diff --git a/test_data/19/ref.fa b/test_data/19/ref.fa
new file mode 100644
index 0000000..6fa4d80
--- /dev/null
+++ b/test_data/19/ref.fa
@@ -0,0 +1,2 @@
+>X
+ZZZZABBBBBDDDDDCCCCCIIIIIKKKKK
diff --git a/test_data/19/ref.gtf b/test_data/19/ref.gtf
new file mode 100644
index 0000000..6737be8
--- /dev/null
+++ b/test_data/19/ref.gtf
@@ -0,0 +1,4 @@
+X	ref	exon	5	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	CDS	8	22	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	start_codon	5	7	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
+X	ref	stop_codon	23	25	.	+	0	gene_id "ref_gene"; transcript_id "ref_gene.1";
diff --git a/test_reform.py b/test_reform.py
index 0d421c2..2768106 100644
--- a/test_reform.py
+++ b/test_reform.py
@@ -762,5 +762,45 @@ def test_case_18(self):
 
 		os.chdir(wd)
 
+	def test_case_19(self):
+		"""
+		Case 19:
+		Testing Reform with incorrect new chrom and comments in input FASTA 
+  		and annotation files. Also, testing reform with more formal printing
+		"""
+
+		wd = os.getcwd()
+		os.chdir('test_data/19/')
+
+		command = """
+		python3 ../../reform.py \
+		--new_chrom="Y,H,M" \
+		--in_fasta=in1.fa,in2.fa,in3.fa \
+		--in_gff=in1.gtf,in2.gtf,in3.gtf \
+		--ref_fasta=ref.fa \
+		--ref_gff=ref.gtf
+		"""
+
+		response = subprocess.getoutput(command)
+		print(response)
+
+		with open('gold.gtf', 'r') as f:
+			gold_gff = f.read()
+		with open('ref_reformed.gtf', 'r') as f:
+			new_gff = f.read()
+		print("Testing gtf")
+		self.assertListEqual(list(gold_gff), list(new_gff))
+		print("Done")
+
+		with open('gold.fa', 'r') as f:
+			gold_fa = f.read()
+		with open('ref_reformed.fa', 'r') as f:
+			new_fa = f.read()
+		print("Testing Fasta")
+		self.assertListEqual(list(gold_fa), list(new_fa))
+		print("Done")
+
+		os.chdir(wd)
+
 if __name__ == '__main__':
     unittest.main()

From df049016c14baddacc61b25b163ccff1422f9c90 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Mon, 21 Apr 2025 21:26:47 -0400
Subject: [PATCH 10/11] use f-string

---
 reform.py | 43 ++++++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 23 deletions(-)

diff --git a/reform.py b/reform.py
index ec5ecbc..ea6557c 100755
--- a/reform.py
+++ b/reform.py
@@ -7,11 +7,11 @@
 from Bio.Seq import Seq
 from Bio.SeqRecord import SeqRecord
 try:
-    import pgzip as gzip_module
-    print("\nUsing pgzip for gzip operations.\n")
+	import pgzip as gzip_module
+	print(f"\nUsing pgzip for gzip operations.\n")
 except ImportError:
-    import gzip as gzip_module
-    print("\npgzip not found, falling back to gzip.\n")
+	import gzip as gzip_module
+	print(f"\npgzip not found, falling back to gzip.\n")
 
 
 def main():
@@ -42,7 +42,7 @@ def main():
 				add_new_chrom_seq(in_arg, index, prev_fasta_path, prev_gff_path, iterations)
 
 	print("------------------------------------------")
-	print("Reform Complete")
+	print(f"Reform Complete")
 	print("------------------------------------------")
 	print(f"New .fa file created:  {os.path.realpath(new_fasta)}")
 	print(f"New {annotation_ext} file created: {os.path.realpath(new_gff_path)}")
@@ -71,9 +71,8 @@ def modify_existing_chrom_seq(in_arg, index, prev_fasta_path, prev_modifications
 		length_changed = len(str(record.seq)) - (down_position - position - 1)
 	prev_modifications.append((position,length_changed))
 	if position != down_position:
-		print("Removing nucleotides from position {} - {}".format(position, down_position))
-	print("Proceeding to insert sequence '{}' from {} at position {} on chromsome {}"
-		.format(record.description, in_arg.in_fasta[index], position, in_arg.chrom))
+		print(f"Removing nucleotides from position {position} - {down_position}")
+	print(f"Proceeding to insert sequence '{record.description}' from {in_arg.in_fasta[index]} at position {position} on chromosome {in_arg.chrom}")
 	## Build the new chromosome sequence with the inserted_seq 
 	## If the chromosome sequence length is in the header, replace it with new length
 	new_seq = existing_seq_str[:position] + str(record.seq) + existing_seq_str[down_position:]
@@ -207,7 +206,7 @@ def read_fasta(in_arg, index, prev_fasta_path):
 		raise ValueError(f"Error: {filename_fa} is not a valid FASTA file.")
 	except Exception as e:
 		raise ValueError(f"Error parsing FASTA file: {str(e)}")
-	print("Preparing to create new FASTA file")
+	print(f"Preparing to create new FASTA file")
 	print(f"Original FASTA: {real_path_fa}")
 	## Generate index of sequences from ref reference fasta
 	if prev_fasta_path:
@@ -290,14 +289,14 @@ def valid_gff_line(line_elements):
 	'''
 	if not line_elements[0].startswith("##sequence-region"):
 		if len(line_elements) != 9:
-			print("** ERROR: in_gff file does not have 9 columns, it has", len(line_elements))
+			print(f"** ERROR: in_gff file does not have 9 columns, it has {len(line_elements)}")
 			print(line_elements)
 			return False
 	else:
 		## Check if ##sequence-region line has 4 columns, the reason why use 5 here is because last element is
 		## spliting format indicator.
 		if len(line_elements) != 5:
-			print("** ERROR: ##sequence-region line does not have 4 columns, it has", len(line_elements) - 1)
+			print(f"** ERROR: ##sequence-region line does not have 4 columns, it has {len(line_elements) - 1}")
 			print(line_elements)
 			return False
 	return True
@@ -376,7 +375,7 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
 				position += lc
 		if position > len(seq_str):
 			print("** ERROR: Position greater than length of chromosome.")
-			print("Chromosome: {}\Chromosome length: {}\nPosition: \n{}".format(chrom, len(seq_str), position))
+			print(f"Chromosome: {chrom}\nChromosome length: {len(seq_str)}\nPosition: {position}")
 			exit()
 		elif position == -1:
 			position = len(seq_str)
@@ -403,8 +402,8 @@ def get_position(index, positions, upstream, downstream, chrom, seq_str, prev_mo
 			else:
 				print("** ERROR: The upstream and downstream target sequences must be present and unique in the specified chromosome.")
 				print("Chromosome: {}\n".format(chrom))
-				print("Upstream sequence found {} times".format(upstream_seq_count))
-				print("Downstream sequence found {} times".format(downstream_seq_count))
+				print(f"Upstream sequence found {upstream_seq_count} times")
+				print(f"Downstream sequence found {downstream_seq_count} times")
 				exit()
 		else:
 			print("** ERROR: You must specify a valid position or upstream and downstream sequences.")
@@ -602,8 +601,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					elif gff_feat_start <= position and gff_feat_end <= down_position:
 						# Which side of the feature depends on the strand (we add this as a comment)
 						x = "3" if gff_feat_strand == "+" else "5"
-						print("Feature cut off - {} prime side of feature cut off ({} strand)"
-							.format(x, gff_feat_strand))
+						print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)")
 						new_comment = format_comment(
 							"{} prime side of feature cut-off by inserted sequence".format(x),
 							gff_ext
@@ -631,8 +629,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 							and gff_feat_start <= down_position 
 							and gff_feat_end > down_position):
 							x = "5" if gff_feat_strand == "+" else "3"
-							print("Feature cut off - {} prime side of feature cut off ({} strand)"
-								.format(x, gff_feat_strand))
+							print(f"Feature cut off - {x} prime side of feature cut off ({gff_feat_strand} strand)")
 							new_comment = format_comment(
 								"{} prime side of feature cut-off by inserted sequence".format(x),
 								gff_ext
@@ -656,7 +653,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 							gff_out.write(modified_line)
 							
 						else:
-							print("** Error: Unknown case for GFF modification. Exiting " + str(line_elements))
+							print(f"** Error: Unknown case for GFF modification. Exiting {line_elements}")
 							exit()
 							
 			# If we've iterated over the entire original gff
@@ -738,7 +735,7 @@ def format_comment(comment, ext):
 	elif ext.lower().startswith('gff'):
 		new_comment = "; reform_comment={}".format(comment)
 	else:
-		print("** Error: Unrecognized extension {} in format_comment(). Exiting".format(ext))
+		print(f"** Error: Unrecognized extension {ext} in format_comment(). Exiting")
 		exit()
 	return new_comment
 	
@@ -750,15 +747,15 @@ def rename_id(line):
 	attributes = line.split('\t')[8].strip()
 	elements = attributes.split(';')
 	if elements[0].startswith("ID="):
-		print("Renaming split feature {} --> {}_split".format(elements[0], elements[0]))
+		print(f"Renaming split feature {elements[0]} --> {elements[0]}_split")
 		return ("{}_split;{}".format(elements[0], ';'.join(elements[1:])))
 	elif elements[0].startswith("gene_id "):
 		gene_id = re.match(r'gene_id \"(.+)\"', elements[0])[1]
-		print('Renaming split feature {} --> {}_split'.format(gene_id, gene_id))
+		print(f"Renaming split feature {gene_id} --> {gene_id}_split")
 		return ('gene _id "{}_split";{}'.format(gene_id, ';'.join(elements[1:])))
 
 	else:
-		print("This feature will not be renamed because it does not has an ID/gene_id attribute:\n", line)
+		print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}")
 		return attributes
 		
 def get_input_args():

From 1c263aa19892bc40a55fdf027a8e9741d7652284 Mon Sep 17 00:00:00 2001
From: Yuwei Sun <ys4680@nyu.edu>
Date: Thu, 24 Apr 2025 21:43:11 -0400
Subject: [PATCH 11/11] Helper Function for Chromosome Length Calculation

---
 reform.py | 13 ++++++++++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/reform.py b/reform.py
index ea6557c..ee7abdd 100755
--- a/reform.py
+++ b/reform.py
@@ -514,7 +514,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					if line.startswith("##sequence-region") and line_elements[1] == chrom_id:
 						## Edit the length of the chromosome 
 						original_length = int(line_elements[3])
-						new_length = original_length - (down_position - position) + new_seq_length
+						new_length = calculate_new_length(original_length, position, down_position, new_seq_length)
 						line = line.replace(str(original_length), str(new_length))
 					gff_out.write(line)
 				else:
@@ -555,7 +555,7 @@ def create_new_gff(new_gff_name, ref_gff, in_gff_lines, position, down_position,
 					# "chromosome" or "region")
 					elif gff_feat_type in ['chromosome', 'region']:
 						original_length = gff_feat_end
-						new_length = original_length - (down_position - position) + new_seq_length
+						new_length = calculate_new_length(original_length, position, down_position, new_seq_length)
 						line = line.replace(str(original_length), str(new_length))
 						gff_out.write(line)
 						
@@ -757,7 +757,14 @@ def rename_id(line):
 	else:
 		print(f"This feature will not be renamed because it does not have an ID/gene_id attribute:\n{line}")
 		return attributes
-		
+
+def calculate_new_length(original_length, position, down_position, new_seq_length):
+	"""
+	Calculate the new chromosome/region length after a modification.
+	Returns: The new calculated length
+	"""
+	return original_length - (down_position - position) + new_seq_length
+	
 def get_input_args():
 	parser = argparse.ArgumentParser()
 	chrom_group = parser.add_mutually_exclusive_group(required=True)