bge-barcoding · SchistoDan · Jan 7, 2026 · Dec 12, 2025
diff --git a/dist/gene_fetch-1.0.20-py3-none-any.whl b/dist/gene_fetch-1.0.20-py3-none-any.whl
diff --git a/dist/gene_fetch-1.0.20.tar.gz b/dist/gene_fetch-1.0.20.tar.gz
diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "gene-fetch" 
-version = "1.0.20"
+version = "1.0.21"
 description = "Gene Fetch: High-throughput NCBI Sequence Retrieval Tool"
 authors = ["D. Parsons <d.parsons@nhm.ac.uk>, B. Price <b.price@nhm.ac.uk>"]
 license = "MIT"

diff --git a/src/gene_fetch/__init__.py b/src/gene_fetch/__init__.py
@@ -5,7 +5,7 @@
 using sample taxonomic information.
 """
 
-__version__ = "1.0.20"
+__version__ = "1.0.21"
 
 from .core import Config
 from .entrez_handler import EntrezHandler

diff --git a/src/gene_fetch/__pycache__/__init__.cpython-312.pyc b/src/gene_fetch/__pycache__/__init__.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/__init__.cpython-313.pyc b/src/gene_fetch/__pycache__/__init__.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/core.cpython-312.pyc b/src/gene_fetch/__pycache__/core.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/core.cpython-313.pyc b/src/gene_fetch/__pycache__/core.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/entrez_handler.cpython-312.pyc b/src/gene_fetch/__pycache__/entrez_handler.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/entrez_handler.cpython-313.pyc b/src/gene_fetch/__pycache__/entrez_handler.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/main.cpython-312.pyc b/src/gene_fetch/__pycache__/main.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/main.cpython-313.pyc b/src/gene_fetch/__pycache__/main.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/output_manager.cpython-312.pyc b/src/gene_fetch/__pycache__/output_manager.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/output_manager.cpython-313.pyc b/src/gene_fetch/__pycache__/output_manager.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/processors.cpython-312.pyc b/src/gene_fetch/__pycache__/processors.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/processors.cpython-313.pyc b/src/gene_fetch/__pycache__/processors.cpython-313.pyc
diff --git a/src/gene_fetch/__pycache__/sequence_processor.cpython-312.pyc b/src/gene_fetch/__pycache__/sequence_processor.cpython-312.pyc
diff --git a/src/gene_fetch/__pycache__/sequence_processor.cpython-313.pyc b/src/gene_fetch/__pycache__/sequence_processor.cpython-313.pyc
diff --git a/src/gene_fetch/core.py b/src/gene_fetch/core.py
@@ -288,6 +288,8 @@ def __init__(self, email, api_key):
             "psbA": "psba",
             "trnh-psba": "psba",
             "psba-trnh": "psba",
+            "lsu": "28s",
+            "ssu": "18s",
         }
 
         # Define gene type categories
@@ -329,7 +331,7 @@ def __init__(self, email, api_key):
                 "28S rRNA[Gene]",
                 "rrn28[Gene]",
                 "LSU rRNA[Gene]",
-                "28S ribosomal RNA[rRNA]"
+                "28S ribosomal RNA[rRNA]",
                 "28S rRNA[rRNA]",
                 "LSU rRNA[rRNA]",
                 "rrn28[rRNA]",

diff --git a/src/gene_fetch/sequence_processor.py b/src/gene_fetch/sequence_processor.py
@@ -208,7 +208,7 @@ def extract_nucleotide(
             variations = self.config._protein_coding_genes[base_gene]
             gene_variations = {v.split("[")[0].strip('"').lower() for v in variations}
 
-            # Add common pattern variations for different writing styles
+            # Common pattern variations for different naming conventions
             if base_gene == "rbcl":
                 pattern_variations = [
                     "rbcl",
@@ -333,6 +333,17 @@ def extract_nucleotide(
                 "rrn16",
                 "rrn 16",
             ]
+        elif base_gene == "12s" or base_gene == "12s rrna" or base_gene == "rrn12":
+            pattern_variations = [  # 12S rRNA (mitochondrial SSU) name spellings
+                "12s",
+                "12s rrna",
+                "12s ribosomal rna",
+                "12s ribosomal",
+                "12 s rrna",
+                "12 s",
+                "rrn12",
+                "rrn 12",
+            ]
         elif base_gene == "18s" or base_gene == "18s rrna" or base_gene == "rrn18":
             pattern_variations = [
                 "18s",
@@ -1049,7 +1060,7 @@ def try_fetch_at_taxid(
                             # Get summaries and sort by length
                             try:
                                 sorted_summaries = []
-                                batch_size = 200
+                                batch_size = 500
 
                                 for i in range(0, len(id_list), batch_size):
                                     batch_ids = id_list[i : i + batch_size]
@@ -1093,9 +1104,9 @@ def try_fetch_at_taxid(
                                         key=lambda x: x[1], reverse=True
                                     )
 
-                                    # Take only top 250 IDs by sequence length
+                                    # Take only top 10 IDs by sequence length (provides fallback if some records are invalid)
                                     processed_ids = [
-                                        item[0] for item in sorted_summaries[:250]
+                                        item[0] for item in sorted_summaries[:10]
                                     ]
                                     logger.info(
                                         f"Successfully filtered to top proteins by length (longest: {sorted_summaries[0][1]} aa)"
@@ -1351,9 +1362,9 @@ def try_fetch_at_taxid(
                                         key=lambda x: x[1], reverse=True
                                     )
 
-                                    # Take only top 250 IDs by sequence length
+                                    # Take only top 10 IDs by sequence length (provides fallback if some records are invalid)
                                     processed_ids = [
-                                        item[0] for item in sorted_summaries[:250]
+                                        item[0] for item in sorted_summaries[:10]
                                     ]
                                     logger.info(
                                         f"Successfully filtered to top nucleotide sequences by length (longest: {sorted_summaries[0][1]} bp)"
@@ -1771,11 +1782,12 @@ def extract_rRNA(self, record, gene_name, single_mode=False):
 
         # Define alternative names for different rRNA types
         rRNA_alternatives = {
-            "16s": ["16s", "rrs", "rrn16", "ssu", "small subunit"],  # Small subunit bacterial
+            # "16s" covers bacterial SSU and mitochondrial LSU; a repeated dict key keeps only the last value, so both synonym sets share one entry (NOTE(review): confirm "s-rrna" should still match 16S)
+            "16s": ["16s", "rrs", "rrn16", "ssu", "small subunit", "s-rrna", "mt-rrn2", "mt-rnr2", "mt 16s", "l-rrna"],  # Bacterial SSU + mitochondrial LSU
             "18s": ["18s", "rrn18", "ssu", "small subunit"],  # Small subunit eukaryotic
-            "23s": ["23s", "rrl", "rrn23", "lsu", "large subunit"],  # Large subunit bacterial
-            "28s": ["28s", "rrn28", "lsu", "large subunit"],  # Large subunit eukaryotic
-            "12s": ["12s", "mt-rrn1", "mt 12s"],  # Mitochondrial SSU
+            "23s": ["23s", "rrl", "rrn23", "lsu", "large subunit", "l-rrna"],  # Large subunit bacterial
+            "28s": ["28s", "rrn28", "lsu", "large subunit", "l-rrna"],  # Large subunit eukaryotic
+            "12s": ["12s", "mt-rrn1", "mt-rnr1", "mt 12s", "s-rrna"],  # Mitochondrial SSU
             "5s": ["5s", "rrn5", "rrn5s", "rrna 5s"],  # 5S bacterial
         }