Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file added dist/gene_fetch-1.0.20-py3-none-any.whl
Binary file not shown.
Binary file added dist/gene_fetch-1.0.20.tar.gz
Binary file not shown.
519 changes: 0 additions & 519 deletions poetry.lock

This file was deleted.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "gene-fetch"
version = "1.0.20"
version = "1.0.21"
description = "Gene Fetch: High-throughput NCBI Sequence Retrieval Tool"
authors = ["D. Parsons <d.parsons@nhm.ac.uk>, B. Price <b.price@nhm.ac.uk>"]
license = "MIT"
Expand Down
2 changes: 1 addition & 1 deletion src/gene_fetch/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
using sample taxonomic information.
"""

__version__ = "1.0.20"
__version__ = "1.0.21"

from .core import Config
from .entrez_handler import EntrezHandler
Expand Down
Binary file removed src/gene_fetch/__pycache__/__init__.cpython-312.pyc
Binary file not shown.
Binary file removed src/gene_fetch/__pycache__/__init__.cpython-313.pyc
Binary file not shown.
Binary file removed src/gene_fetch/__pycache__/core.cpython-312.pyc
Binary file not shown.
Binary file removed src/gene_fetch/__pycache__/core.cpython-313.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file removed src/gene_fetch/__pycache__/main.cpython-312.pyc
Binary file not shown.
Binary file removed src/gene_fetch/__pycache__/main.cpython-313.pyc
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
4 changes: 3 additions & 1 deletion src/gene_fetch/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,8 @@ def __init__(self, email, api_key):
"psbA": "psba",
"trnh-psba": "psba",
"psba-trnh": "psba",
"lsu": "28s",
"ssu": "18s",
}

# Define gene type categories
Expand Down Expand Up @@ -329,7 +331,7 @@ def __init__(self, email, api_key):
"28S rRNA[Gene]",
"rrn28[Gene]",
"LSU rRNA[Gene]",
"28S ribosomal RNA[rRNA]"
"28S ribosomal RNA[rRNA]",
"28S rRNA[rRNA]",
"LSU rRNA[rRNA]",
"rrn28[rRNA]",
Expand Down
32 changes: 22 additions & 10 deletions src/gene_fetch/sequence_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -208,7 +208,7 @@ def extract_nucleotide(
variations = self.config._protein_coding_genes[base_gene]
gene_variations = {v.split("[")[0].strip('"').lower() for v in variations}

# Add common pattern variations for different writing styles
# Common pattern variations for different naming conventions
if base_gene == "rbcl":
pattern_variations = [
"rbcl",
Expand Down Expand Up @@ -333,6 +333,17 @@ def extract_nucleotide(
"rrn16",
"rrn 16",
]
elif base_gene == "12s" or base_gene == "12s rrna" or base_gene == "rrn12":
pattern_variations = [
"12s",
"12s rrna",
"12s ribosomal rna",
"12s ribosomal",
"12 s rrna",
"12 s",
"rrn12",
"rrn 12"
]
elif base_gene == "18s" or base_gene == "18s rrna" or base_gene == "rrn18":
pattern_variations = [
"18s",
Expand Down Expand Up @@ -1049,7 +1060,7 @@ def try_fetch_at_taxid(
# Get summaries and sort by length
try:
sorted_summaries = []
batch_size = 200
batch_size = 500

for i in range(0, len(id_list), batch_size):
batch_ids = id_list[i : i + batch_size]
Expand Down Expand Up @@ -1093,9 +1104,9 @@ def try_fetch_at_taxid(
key=lambda x: x[1], reverse=True
)

# Take only top 250 IDs by sequence length
# Take only top 10 IDs by sequence length (provides fallback if some records are invalid)
processed_ids = [
item[0] for item in sorted_summaries[:250]
item[0] for item in sorted_summaries[:10]
]
logger.info(
f"Successfully filtered to top proteins by length (longest: {sorted_summaries[0][1]} aa)"
Expand Down Expand Up @@ -1351,9 +1362,9 @@ def try_fetch_at_taxid(
key=lambda x: x[1], reverse=True
)

# Take only top 250 IDs by sequence length
# Take only top 10 IDs by sequence length (provides fallback if some records are invalid)
processed_ids = [
item[0] for item in sorted_summaries[:250]
item[0] for item in sorted_summaries[:10]
]
logger.info(
f"Successfully filtered to top nucleotide sequences by length (longest: {sorted_summaries[0][1]} bp)"
Expand Down Expand Up @@ -1771,11 +1782,12 @@ def extract_rRNA(self, record, gene_name, single_mode=False):

# Alternative names for each rRNA type, keyed by the canonical lowercase
# gene name. Each key must appear exactly once: in a dict display a repeated
# key silently replaces the earlier entry, which would drop its aliases.
rRNA_alternatives = {
    # "16s" names both the bacterial small subunit and the mitochondrial
    # large subunit, so both alias sets are merged under the single key
    # instead of being declared twice.
    "16s": [
        "16s", "rrs", "rrn16", "ssu", "small subunit", "s-rrna",  # Small subunit bacterial
        "mt-rrn2", "mt-rnr2", "mt 16s", "l-rrna",  # Mitochondrial LSU
    ],
    "18s": ["18s", "rrn18", "ssu", "small subunit"],  # Small subunit eukaryotic
    "23s": ["23s", "rrl", "rrn23", "lsu", "large subunit", "l-rrna"],  # Large subunit bacterial
    "28s": ["28s", "rrn28", "lsu", "large subunit", "l-rrna"],  # Large subunit eukaryotic
    "12s": ["12s", "mt-rrn1", "mt-rnr1", "mt 12s", "s-rrna"],  # Mitochondrial SSU
    "5s": ["5s", "rrn5", "rrn5s", "rrna 5s"],  # 5S bacterial
}

Expand Down