diff --git a/README.md b/README.md
index 2c7c1150..9ddbafbe 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,6 @@
# seqspec
-
-[](https://pypi.org/project/seqspec/0.3.1/)
+

[](LICENSE)
diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md
index 9e7db60a..46e683a2 100644
--- a/docs/INSTALLATION.md
+++ b/docs/INSTALLATION.md
@@ -7,16 +7,8 @@ authors:
# Installation
-The development version can be installed with
-
-```bash
-pip install git+https://github.com/pachterlab/seqspec@devel
-```
-
-The official release can be installed directly from pypi
-
```bash
-pip install seqspec
+pip install git+https://github.com/IGVF-DACC/seqspec.git@main
```
Verify the installation
diff --git a/docs/SEQSPEC_TOOL.md b/docs/SEQSPEC_TOOL.md
index 1e355e71..27037676 100644
--- a/docs/SEQSPEC_TOOL.md
+++ b/docs/SEQSPEC_TOOL.md
@@ -61,6 +61,12 @@ Check that the `seqspec` file is correctly formatted and consistent with the [sp
seqspec check [-h] [-o OUT] yaml
```
+```python
+from seqspec.seqspec_check import run_check
+
+run_check(schema_fn: str, spec_fn: str, o: str)
+```
+
- optionally, `-o OUT` can be used to write the output to a file.
- `yaml` corresponds to the `seqspec` file.
@@ -133,6 +139,12 @@ $ seqspec check spec.yaml
seqspec find [-h] [-o OUT] [-s Selector] -m MODALITY [-i IDs] yaml
```
+```python
+from seqspec.seqspec_find import run_find
+
+run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str)
+```
+
- optionally, `-o OUT` can be used to write the output to a file.
- optionally, `-s Selector` is the type of the ID you are searching for (default: region). Can be one of
- read
@@ -195,6 +207,12 @@ $ seqspec find -m rna -s region-type -i barcode spec.yaml
seqspec file [-h] [-o OUT] [-i IDs] -m MODALITY [-s SELECTOR] [-f FORMAT] [-k KEY] yaml
```
+```python
+from seqspec.seqspec_file import run_file
+
+run_file(spec_fn: str, m: str, ids: List[str], idtype: str, fmt: str, k: str, o: str, fp=False)
+```
+
- optionally, `-o OUT` can be used to write the output to a file.
- optionally, `-s Selector` is the type of the ID you are searching for (default: read). Can be one of
- read
@@ -266,6 +284,11 @@ Automatically fill in missing fields in the spec.
seqspec format [-h] [-o OUT] yaml
```
+```python
+from seqspec.seqspec_format import run_format
+run_format(spec_fn: str, o: str)
+```
+
- `-o OUT` the path to create the formatted `seqspec` file.
- `yaml` corresponds to the `seqspec` file.
@@ -283,11 +306,16 @@ $ seqspec format -o spec.yaml spec.yaml
Identify the position of elements in a spec for use in downstream tools. Returns the 0-indexed position of elements contained in a given region in the 5'->3' direction.
-```
+```bash
seqspec index [-o OUT] [-t TOOL] [--rev] -m MODALITY -r REGION yaml
seqspec index [-h] [-o OUT] [-t TOOL] [-s SELECTOR] [--rev] -m MODALITY [-i IDs] yaml
```
+```python
+from seqspec.seqspec_index import run_index
+run_index(spec_fn: str, modality: str, ids: List[str], idtype: str, fmt: str, rev: str, subregion_type: str, o)
+```
+
- optionally, `-o OUT` can be used to write the output to a file.
- optionally, `--rev` can be set to return the 3'->5' index.
- optionally, `-t TOOL` returns the indices in the format specified by the tool. One of:
@@ -342,6 +370,11 @@ $ seqspec index -m atac -t kb -s file spec.yaml
seqspec info [-h] [-k KEY] [-f FORMAT] [-o OUT] yaml
```
+```python
+from seqspec.seqspec_info import run_info
+run_info(spec_fn: str, f: str, k=None, o=None)
+```
+
- optionally, `-o OUT` path to write the info.
- optionally, `-k KEY` the object to display (default: meta). Can be one of
- modalities
@@ -413,6 +446,11 @@ $ seqspec info -f json -k sequence_spec spec.yaml
seqspec init [-h] -n NAME -m MODALITIES -r READS [-o OUT] newick
```
+```python
+from seqspec.seqspec_info import run_info
+run_info(spec_fn: str, f: str, k: str = None, o: str = None)
+```
+
- optionally, `-o OUT` path to create `seqspec` file.
- `-m MODALITIES` is a comma-separated list of modalities (e.g. rna,atac)
- `-n NAME` is the name associated with the `seqspec` file.
@@ -432,7 +470,7 @@ $ seqspec init -n myassay -m rna -o spec.yaml -r rna,R1.fastq.gz,r1_primer,26,po
$ seqspec init -n myassay -m rna,atac -o spec.yaml -r rna,rna_R1.fastq.gz,rna_r1_primer,26,pos:rna,rna_R2.fastq.gz,rna_r2_primer,100,neg:atac,atac_R1.fastq.gz,atac_r1_primer,100,pos:atac,atac_R2.fastq.gz,atac_r1_primer,16,neg:atac,atac_R3.fastq.gz,atac_r2_primer,100,neg "(((rna_r1_primer:0,barcode:16,umi:12,cdna:150,rna_r2_primer:0)rna),(barcode:16,atac_r1_primer:1,gdna:150,atac_r2_primer)atac)"
```
-## `seqsoec methods`: Convert seqspec file into methods section
+## `seqspec methods`: Convert seqspec file into methods section
Generate a methods section from a seqspec file.
@@ -440,6 +478,11 @@ Generate a methods section from a seqspec file.
seqspec methods [-h] -m MODALITY [-o OUT] yaml
```
+```python
+from seqspec.seqspec_methods import run_methods
+run_methods(spec_fn: str, m: str, o: str)
+```
+
- optionally, `-o OUT` path to write the methods section.
- `-m MODALITY` is the modality to write the methods for.
- `yaml` corresponds to the `seqspec` file.
@@ -479,6 +522,13 @@ The library was sequenced on a Illumina NovaSeq 6000 (EFO:0008637) using the Nov
seqspec modify [-h] [--read-id READID] [--read-name READNAME] [--primer-id PRIMERID] [--strand STRAND] [--files FILES] [--region-id REGIONID] [--region-type REGIONTYPE] [--region-name REGIONNAME] [--sequence-type SEQUENCETYPE] [--sequence SEQUENCE] [--min-len MINLEN] [--max-len MAXLEN] [-o OUT] [-i IDs] [-s SELECTOR] -m MODALITY yaml
```
+```python
+from seqspec.seqspec_modify import run_modify_read, run_modify_region
+
+run_modify_read(spec, modality, target_read, read_id, read_name, primer_id, min_len, max_len, strand, files)
+run_modify_region(spec, modality, target_region, region_id, region_type, name, sequence_type, sequence, min_len, max_len)
+```
+
Read modifications
- optionally, `--read-id READID` specifies the new `read_id`.
@@ -529,6 +579,12 @@ $ seqspec modify -m atac -o mod_spec.yaml -i atac_R1 --files "R1_1.fastq.gz,fast
seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f FORMAT] [-i IDs] -m MODALITY yaml
```
+```python
+from seqspec.seqspec_onlist import run_onlist
+
+run_onlist(spec_fn, modality, ids, idtype, fmt, o)
+```
+
- optionally, `-o OUT` to set the path of the onlist file.
- `-m MODALITY` is the modality in which you are searching for the region.
- `-i ID` is the `id` of the object to search for the onlist.
@@ -563,6 +619,11 @@ Print sequence and/or library structure as ascii, png, or html.
seqspec print [-h] [-o OUT] [-f FORMAT] yaml
```
+```python
+from seqspec.seqspec_print import run_seqspec_print
+run_seqspec_print(spec_fn, fmt, o)
+```
+
- optionally, `-o OUT` to set the path of printed file.
- optionally, `-f FORMAT` is the format of the printed file. Can be one of:
- `library-ascii`: prints an ascii tree of the library_spec
@@ -651,6 +712,11 @@ $ seqspec print -o spec.png -f seqspec-png spec.yaml
seqspec split [-h] -o OUT yaml
```
+```python
+from seqspec.seqspec_split import run_split
+run_split(spec_fn, o)
+```
+
- optionally, `-o OUT` name prepended to split specs.
- `yaml` corresponds to the `seqspec` file.
@@ -673,6 +739,11 @@ split.tag.yaml
seqspec version [-h] [-o OUT] yaml
```
+```python
+from seqspec.seqspec_version import run_version
+run_version(spec_fn, o)
+```
+
- optionally, `-o OUT` path to file to write output.
- `yaml` corresponds to the `seqspec` file.
@@ -693,6 +764,11 @@ This is a hidden subcommand that upgrades an old version of the spec to the curr
seqspec upgrade [-h] [-o OUT] yaml
```
+```python
+from seqspec.seqspec_upgrade import run_upgrade
+run_upgrade(spec_fn, o)
+```
+
### Examples
```bash
diff --git a/docs/SPECIFICATION.md b/docs/SPECIFICATION.md
index f4ad0be3..63b83843 100644
--- a/docs/SPECIFICATION.md
+++ b/docs/SPECIFICATION.md
@@ -153,6 +153,7 @@ Each `Region` has the following properties which are useful to annotate the elem
- `rna`: The modality corresponding to assaying RNA.
- `s5`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME1.
- `s7`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME2.
+ - `sgrna_target`: A sequence corresponding to the guide RNA spacer region that determines the genomic target of CRISPR-based perturbations.
- `tag`: A short sequence of DNA or RNA used to label or identify a sample, protein, or other grouping.
- `truseq_read1`: The first read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit.
- `truseq_read2`: The second read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit.
diff --git a/docs/assays/10xcrispr.spec.yaml b/docs/assays/10xcrispr.spec.yaml
index 87e4d369..a95cc5ae 100644
--- a/docs/assays/10xcrispr.spec.yaml
+++ b/docs/assays/10xcrispr.spec.yaml
@@ -203,7 +203,7 @@ library_spec:
parent_id: crispr
- !Region
region_id: sgrna_target
- region_type: crispr
+ region_type: sgrna_target
name: sgrna_target
sequence_type: onlist
sequence: NNNNNNNNNNNNNNNNNXXX
diff --git a/docs/assays/sccrispra.spec.yaml b/docs/assays/sccrispra.spec.yaml
index e0f27906..c825dc37 100644
--- a/docs/assays/sccrispra.spec.yaml
+++ b/docs/assays/sccrispra.spec.yaml
@@ -223,7 +223,7 @@ library_spec:
- !Region
parent_id: crispr_R2_001.fastq.gz
region_id: gRNA
- region_type: gRNA
+ region_type: sgrna_target
name: Guide RNAs
sequence_type: onlist
sequence: NNNNNNNNNNNNNNNNNNNN
diff --git a/requirements.txt b/requirements.txt
index 4d234556..acff0508 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,4 +3,5 @@ jsonschema
newick
requests
biopython
-packaging
\ No newline at end of file
+packaging
+matplotlib>=3.4.0
\ No newline at end of file
diff --git a/seqspec/File.py b/seqspec/File.py
index 70a52f1e..423dcedf 100644
--- a/seqspec/File.py
+++ b/seqspec/File.py
@@ -24,26 +24,18 @@ def __init__(
self.md5 = md5
def __repr__(self) -> str:
- d = {
- "file_id": self.file_id,
- "filename": self.filename,
- "filetype": self.filetype,
- "filesize": self.filesize,
- "url": self.url,
- "urltype": self.urltype,
- "md5": self.md5,
- }
+ d = self.to_dict()
return f"{d}"
def to_dict(self):
d = {
- "file_id": self.file_id,
- "filename": self.filename,
- "filetype": self.filetype,
- "filesize": self.filesize,
- "url": self.url,
- "urltype": self.urltype,
- "md5": self.md5,
+ "file_id": getattr(self, "file_id", None),
+ "filename": getattr(self, "filename", None),
+ "filetype": getattr(self, "filetype", None),
+ "filesize": getattr(self, "filesize", None),
+ "url": getattr(self, "url", None),
+ "urltype": getattr(self, "urltype", None),
+ "md5": getattr(self, "md5", None),
}
return d
diff --git a/seqspec/Read.py b/seqspec/Read.py
index 6681db38..6984998d 100644
--- a/seqspec/Read.py
+++ b/seqspec/Read.py
@@ -32,24 +32,13 @@ def set_files(self, files: Optional[List["File"]] = []):
self.files = files
def __repr__(self) -> str:
- d = {
- "read_id": self.read_id,
- "name": self.name,
- "modality": self.modality,
- "primer_id": self.primer_id,
- "min_len": self.min_len,
- "max_len": self.max_len,
- "strand": self.strand,
- "files": self.files,
- }
+ d = self.to_dict()
return f"{d}"
def to_dict(self):
# TODO is this necessary for backwards compatibility?
- if self.files:
- files = [i.to_dict() for i in self.files]
- else:
- files = []
+ files = getattr(self, "files", [])
+ files = [i.to_dict() for i in files]
d = {
"read_id": self.read_id,
"name": self.name,
diff --git a/seqspec/Region.py b/seqspec/Region.py
index 9a2a7fdf..f7edbc01 100644
--- a/seqspec/Region.py
+++ b/seqspec/Region.py
@@ -417,7 +417,7 @@ def __init__(
url: str,
urltype: str,
md5: str,
- location: Optional[str],
+ # location: Optional[str],
) -> None:
super().__init__()
self.file_id = file_id
diff --git a/seqspec/__init__.py b/seqspec/__init__.py
index 260c070a..ad6cb61b 100644
--- a/seqspec/__init__.py
+++ b/seqspec/__init__.py
@@ -1 +1,8 @@
-__version__ = "0.3.1"
+__version__ = "0.3.0"
+
+
+def get_version():
+ """
+ Returns the version of the package.
+ """
+ return __version__
diff --git a/seqspec/main.py b/seqspec/main.py
index d64a881b..b800ab78 100644
--- a/seqspec/main.py
+++ b/seqspec/main.py
@@ -1,46 +1,48 @@
-from . import __version__
-import argparse
-import sys
-from .seqspec_format import setup_format_args, validate_format_args
-from .seqspec_print import setup_print_args, validate_print_args
-from .seqspec_check import setup_check_args, validate_check_args
-from .seqspec_find import setup_find_args, validate_find_args
-
-# from .seqspec_genbank import setup_genbank_args, validate_genbank_args
-from .seqspec_modify import setup_modify_args, validate_modify_args
-from .seqspec_index import setup_index_args, validate_index_args
-from .seqspec_info import setup_info_args, validate_info_args
-
-from .seqspec_split import setup_split_args, validate_split_args
-from .seqspec_init import setup_init_args, validate_init_args
-from .seqspec_onlist import setup_onlist_args, validate_onlist_args
-from .seqspec_version import setup_version_args, validate_version_args
-from .seqspec_methods import setup_methods_args, validate_methods_args
-from .seqspec_file import setup_file_args, validate_file_args
-from .seqspec_upgrade import setup_upgrade_args, validate_upgrade_args
+"""Main module for seqspec CLI.
-import warnings
-
-# Steps to add new subcommands
-# Create seqspec_subcommand.py (create setup_subcmd_args, validate_subcmd_args, run_subcmd in that file)
-# (in this file) from seqspec_subcmd import setup_subcmd_args, validate_subcmd_args
-# Add setup_subcmd_args to command_to_parser along with its key==str(subcmd)
-# Add validate_subcmd_args to COMMAND_TO_FUNCTION along with its key==str(subcmd)
+This module provides the main entry point for the seqspec command-line interface.
+It handles argument parsing, command routing, and execution of subcommands.
+"""
+import sys
+import warnings
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+from typing import Dict, Callable, Any
-def main():
- warnings.simplefilter("default", DeprecationWarning)
+from . import __version__
- # setup parsers
- parser = argparse.ArgumentParser(
+# Import subcommand modules
+from .seqspec_format import setup_format_args, run_format
+from .seqspec_print import setup_print_args, run_print
+from .seqspec_check import setup_check_args, run_check
+from .seqspec_find import setup_find_args, run_find
+from .seqspec_convert import setup_convert_args, run_convert
+from .seqspec_modify import setup_modify_args, run_modify
+from .seqspec_index import setup_index_args, run_index
+from .seqspec_info import setup_info_args, run_info
+from .seqspec_split import setup_split_args, run_split
+from .seqspec_init import setup_init_args, run_init
+from .seqspec_onlist import setup_onlist_args, run_onlist
+from .seqspec_version import setup_version_args, run_version
+from .seqspec_methods import setup_methods_args, run_methods
+from .seqspec_file import setup_file_args, run_file
+from .seqspec_upgrade import setup_upgrade_args, run_upgrade
+
+
+def setup_parser():
+ """Create and configure the main argument parser.
+
+ Returns:
+ Configured ArgumentParser instance.
+ """
+ parser = ArgumentParser(
description=f"""
seqspec {__version__}: A machine-readable file format for genomic library sequence and structure.
GitHub: https://github.com/pachterlab/seqspec
Documentation: https://pachterlab.github.io/seqspec/
-
""",
- formatter_class=argparse.RawTextHelpFormatter,
+ formatter_class=RawTextHelpFormatter,
)
subparsers = parser.add_subparsers(
@@ -54,7 +56,7 @@ def main():
"find": setup_find_args(subparsers),
"file": setup_file_args(subparsers),
"format": setup_format_args(subparsers),
- # "genbank": setup_genbank_args(subparsers),
+ "convert": setup_convert_args(subparsers),
"index": setup_index_args(subparsers),
"info": setup_info_args(subparsers),
"init": setup_init_args(subparsers),
@@ -67,7 +69,18 @@ def main():
"version": setup_version_args(subparsers),
}
- # Show help when no arguments are given
+ return parser, command_to_parser
+
+
+def handle_no_args(
+ parser: ArgumentParser, command_to_parser: Dict[str, ArgumentParser]
+) -> None:
+ """Handle case when no arguments are provided.
+
+ Args:
+ parser: Main argument parser.
+ command_to_parser: Dictionary mapping commands to their parsers.
+ """
if len(sys.argv) == 1:
parser.print_help(sys.stderr)
sys.exit(1)
@@ -80,27 +93,43 @@ def main():
parser.print_help(sys.stderr)
sys.exit(1)
+
+def main() -> None:
+ """Main entry point for the seqspec CLI."""
+ warnings.simplefilter("default", DeprecationWarning)
+
+ parser, command_to_parser = setup_parser()
+ handle_no_args(parser, command_to_parser)
+
args = parser.parse_args()
- # Setup validator and runner for all subcommands (validate and run if valid)
- COMMAND_TO_FUNCTION = {
- "format": validate_format_args,
- "print": validate_print_args,
- "check": validate_check_args,
- "find": validate_find_args,
- "index": validate_index_args,
- "info": validate_info_args,
- "init": validate_init_args,
- "methods": validate_methods_args,
- "modify": validate_modify_args,
- "onlist": validate_onlist_args,
- "split": validate_split_args,
- "version": validate_version_args,
- "file": validate_file_args,
- "upgrade": validate_upgrade_args,
- # "genbank": validate_genbank_args,
+ # Setup validator and runner for all subcommands
+ command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = {
+ "format": run_format,
+ "print": run_print,
+ "check": run_check,
+ "find": run_find,
+ "index": run_index,
+ "info": run_info,
+ "init": run_init,
+ "methods": run_methods,
+ "modify": run_modify,
+ "onlist": run_onlist,
+ "split": run_split,
+ "version": run_version,
+ "file": run_file,
+ "upgrade": run_upgrade,
+ "convert": run_convert,
}
- COMMAND_TO_FUNCTION[sys.argv[1]](parser, args)
+
+ try:
+ command_to_function[sys.argv[1]](parser, args)
+ except KeyError:
+ parser.print_help(sys.stderr)
+ sys.exit(1)
+ except Exception as e:
+ print(f"Error: {str(e)}", file=sys.stderr)
+ sys.exit(1)
if __name__ == "__main__":
diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json
index 997d14a0..5b3ddd06 100644
--- a/seqspec/schema/seqspec.schema.json
+++ b/seqspec/schema/seqspec.schema.json
@@ -71,6 +71,8 @@
"massively parallel reporter assay (OBI:0002675)",
"chromosome conformation capture-on-chip assay (OBI:0002458)",
"single nucleus methylation chromatin conformation capture seq (NTR:0000745)",
+ "in vitro CRISPR screen using flow cytometry (OBI:0003661)",
+ "in vitro CRISPR screen using single-cell RNA-seq (OBI:0003660)",
"Custom"
]
},
@@ -141,6 +143,7 @@
"Illumina NextSeq 2000 (EFO:0010963)",
"Illumina NovaSeq X (NTR:0000765)",
"PacBio RS II (EFO:0008631)",
+ "Illumina NovaSeq X (EFO:0022840)",
"Custom"
]
},
@@ -265,6 +268,7 @@
"enum": [
"atac",
"barcode",
+ "bead_TSO",
"cdna",
"crispr",
"custom_primer",
@@ -292,6 +296,7 @@
"rna",
"s5",
"s7",
+ "sgrna_target",
"tag",
"truseq_read1",
"truseq_read2",
diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json
new file mode 100644
index 00000000..9d0ffd3a
--- /dev/null
+++ b/seqspec/schema/seqspec_igvf.schema.json
@@ -0,0 +1,398 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "Assay.schema.json",
+ "title": "Assay",
+ "description": "A Assay of DNA",
+ "type": "object",
+ "properties": {
+ "seqspec_version": {
+ "description": "Version of the seqspec specification used",
+ "type": "string",
+ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
+ },
+ "assay_id": {
+ "description": "Identifier for the assay",
+ "type": "string"
+ },
+ "name": {
+ "description": "The name of the assay",
+ "type": "string"
+ },
+ "doi": {
+ "description": "the doi of the paper that describes the assay",
+ "type": "string"
+ },
+ "date": {
+ "description": "The seqspec creation date",
+ "type": "string",
+ "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$"
+ },
+ "description": {
+ "description": "A short description of the assay",
+ "type": "string"
+ },
+ "modalities": {
+ "description": "The modalities the assay targets",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"]
+ }
+ },
+ "lib_struct": {
+ "description": "The link to Teichmann's libstructs page derived for this sequence",
+ "type": "string"
+ },
+ "library_protocol": {
+ "description": "The protocol/machine/tool to generate the library insert",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "protocol_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "library_kit": {
+ "description": "The kit used to make the library sequence_protocol compatible",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "kit_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_protocol": {
+ "description": "The protocol/machine/tool to generate sequences",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "protocol_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_kit": {
+ "description": "The kit used with the protocol to sequence the library",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "kit_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_spec": {
+ "description": "The spec for the sequencer",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/read"
+ }
+ },
+ "library_spec": {
+ "description": "The spec for the assay",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/region"
+ }
+ }
+ },
+ "required": [
+ "seqspec_version",
+ "assay_id",
+ "name",
+ "doi",
+ "date",
+ "description",
+ "modalities"
+ ],
+ "$defs": {
+ "region": {
+ "title": "Region",
+ "description": "A region of DNA",
+ "type": "object",
+ "properties": {
+ "region_id": {
+ "description": "identifier for the region",
+ "type": "string"
+ },
+ "region_type": {
+ "description": "the type of region",
+ "type": "string",
+ "enum": [
+ "atac",
+ "barcode",
+ "bead_TSO",
+ "cdna",
+ "crispr",
+ "custom_primer",
+ "dna",
+ "fastq",
+ "fastq_link",
+ "gdna",
+ "hic",
+ "illumina_p5",
+ "illumina_p7",
+ "index5",
+ "index7",
+ "linker",
+ "ME1",
+ "ME2",
+ "methyl",
+ "named",
+ "nextera_read1",
+ "nextera_read2",
+ "poly_A",
+ "poly_G",
+ "poly_T",
+ "poly_C",
+ "protein",
+ "rna",
+ "s5",
+ "s7",
+ "sgrna_target",
+ "tag",
+ "truseq_read1",
+ "truseq_read2",
+ "umi"
+ ]
+ },
+ "sequence_type": {
+ "description": "The type of the sequence",
+ "type": "string",
+ "enum": ["fixed", "random", "onlist", "joined"]
+ },
+ "sequence": {
+ "description": "The sequence",
+ "type": "string"
+ },
+ "min_len": {
+ "description": "The minimum length of the sequence",
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 2048
+ },
+ "max_len": {
+ "description": "The maximum length of the sequence",
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 2048
+ },
+ "onlist": {
+ "description": "The file containing the sequence if seq_type = onlist",
+ "type": ["object", "null"],
+ "properties": {
+ "file_id": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filename": {
+ "description": "filename for the onlist",
+ "type": "string"
+ },
+ "filetype": {
+ "description": "the type of file",
+ "type": "string"
+ },
+ "filesize": {
+ "description": "the size of the file in bytes",
+ "type": "integer"
+ },
+ "url": {
+ "description": "The path or url to the file",
+ "type": "string"
+ },
+ "urltype": {
+ "description": "type of file path",
+ "type": "string",
+ "enum": ["local", "ftp", "http", "https"]
+ },
+ "location": {
+ "description": "location of onlist",
+ "type": "string",
+ "enum": ["local", "remote"]
+ },
+ "md5": {
+ "description": "md5sum for the file pointed to by filename",
+ "type": "string" }
+ },
+ "required": [
+ "file_id",
+ "filename",
+ "filetype",
+ "filesize",
+ "url",
+ "urltype"
+ ]
+ },
+ "regions": {
+ "description": "The regions being joined",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/region"
+ }
+ }
+ },
+ "required": [
+ "region_id",
+ "region_type",
+ "sequence_type",
+ "sequence",
+ "min_len",
+ "max_len"
+ ],
+ "if": {
+ "properties": {
+ "min_len": {
+ "const": 0
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "sequence": {
+ "type": "string",
+ "pattern": "^[ACGTRYMKSWHBVDNX]*$"
+ }
+ }
+ },
+ "else": {
+ "properties": {
+ "sequence": {
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[ACGTRYMKSWHBVDNX]+$"
+ }
+ }
+ }
+ },
+ "read": {
+ "title": "Read",
+ "type": "object",
+ "properties": {
+ "read_id": {
+ "type": "string",
+ "description": "The unique identifier for the read.",
+ "pattern": "^IGVF.*"
+ },
+ "name": {
+ "type": "string",
+ "description": "The name of the read."
+ },
+ "modality": {
+ "type": "string",
+ "description": "The modality of the assay generating the read."
+ },
+ "primer_id": {
+ "type": "string",
+ "description": "The region id of the primer used."
+ },
+ "min_len": {
+ "type": "integer",
+ "minimum": 0,
+ "description": "The minimum length of the read, must be greater than or equal to 0."
+ },
+ "max_len": {
+ "type": "integer",
+ "exclusiveMinimum": 0,
+ "description": "The maximum length of the read, must be greater than 0."
+ },
+ "strand": {
+ "type": "string",
+ "enum": ["pos", "neg"],
+ "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')."
+ },
+ "files": {
+ "description": "An array of files containing the reads",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "file_id": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filename": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filetype": {
+ "description": "the type of file",
+ "type": "string"
+ },
+ "filesize": {
+ "description": "the size of the file in bytes",
+ "type": "integer"
+ },
+ "url": {
+ "description": "The path or url to the file",
+ "type": "string"
+ },
+ "urltype": {
+ "description": "type of file path",
+ "type": "string",
+ "enum": ["local", "ftp", "http", "https"]
+ },
+ "md5": {
+ "description": "md5sum for the file pointed to by filename",
+ "type": "string",
+ "pattern": "^[a-f0-9]{32}$"
+ }
+ }
+ }
+ }
+ },
+ "required": [
+ "read_id",
+ "modality",
+ "primer_id",
+ "min_len",
+ "max_len",
+ "strand"
+ ],
+ "additionalProperties": false
+ }
+ }
+ }
+
\ No newline at end of file
diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json
new file mode 100644
index 00000000..9908307f
--- /dev/null
+++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json
@@ -0,0 +1,397 @@
+{
+ "$schema": "https://json-schema.org/draft/2020-12/schema",
+ "$id": "Assay.schema.json",
+ "title": "Assay",
+ "description": "A Assay of DNA",
+ "type": "object",
+ "properties": {
+ "seqspec_version": {
+ "description": "Version of the seqspec specification used",
+ "type": "string",
+ "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$"
+ },
+ "assay_id": {
+ "description": "Identifier for the assay",
+ "type": "string"
+ },
+ "name": {
+ "description": "The name of the assay",
+ "type": "string"
+ },
+ "doi": {
+ "description": "the doi of the paper that describes the assay",
+ "type": "string"
+ },
+ "date": {
+ "description": "The seqspec creation date",
+ "type": "string",
+ "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$"
+ },
+ "description": {
+ "description": "A short description of the assay",
+ "type": "string"
+ },
+ "modalities": {
+ "description": "The modalities the assay targets",
+ "type": "array",
+ "items": {
+ "type": "string",
+ "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"]
+ }
+ },
+ "lib_struct": {
+ "description": "The link to Teichmann's libstructs page derived for this sequence",
+ "type": "string"
+ },
+ "library_protocol": {
+ "description": "The protocol/machine/tool to generate the library insert",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "protocol_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "library_kit": {
+ "description": "The kit used to make the library sequence_protocol compatible",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "kit_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_protocol": {
+ "description": "The protocol/machine/tool to generate sequences",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "protocol_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_kit": {
+ "description": "The kit used with the protocol to sequence the library",
+ "anyOf": [
+ {
+ "type": "string"
+ },
+ {
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "kit_id": { "type": "string" },
+ "name": { "type": ["string", "null"] },
+ "modality": { "type": "string" }
+ }
+ },
+ "minItems": 1
+ }
+ ]
+ },
+ "sequence_spec": {
+ "description": "The spec for the sequencer",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/read"
+ }
+ },
+ "library_spec": {
+ "description": "The spec for the assay",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/region"
+ }
+ }
+ },
+ "required": [
+ "seqspec_version",
+ "assay_id",
+ "name",
+ "doi",
+ "date",
+ "description",
+ "modalities"
+ ],
+ "$defs": {
+ "region": {
+ "title": "Region",
+ "description": "A region of DNA",
+ "type": "object",
+ "properties": {
+ "region_id": {
+ "description": "identifier for the region",
+ "type": "string"
+ },
+ "region_type": {
+ "description": "the type of region",
+ "type": "string",
+ "enum": [
+ "atac",
+ "barcode",
+ "bead_TSO",
+ "cdna",
+ "crispr",
+ "custom_primer",
+ "dna",
+ "fastq",
+ "fastq_link",
+ "gdna",
+ "hic",
+ "illumina_p5",
+ "illumina_p7",
+ "index5",
+ "index7",
+ "linker",
+ "ME1",
+ "ME2",
+ "methyl",
+ "named",
+ "nextera_read1",
+ "nextera_read2",
+ "poly_A",
+ "poly_G",
+ "poly_T",
+ "poly_C",
+ "protein",
+ "rna",
+ "s5",
+ "s7",
+ "sgrna_target",
+ "tag",
+ "truseq_read1",
+ "truseq_read2",
+ "umi"
+ ]
+ },
+ "sequence_type": {
+ "description": "The type of the sequence",
+ "type": "string",
+ "enum": ["fixed", "random", "onlist", "joined"]
+ },
+ "sequence": {
+ "description": "The sequence",
+ "type": "string"
+ },
+ "min_len": {
+ "description": "The minimum length of the sequence",
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 2048
+ },
+ "max_len": {
+ "description": "The maximum length of the sequence",
+ "type": "integer",
+ "minimum": 0,
+ "maximum": 2048
+ },
+ "onlist": {
+ "description": "The file containing the sequence if seq_type = onlist",
+ "type": ["object", "null"],
+ "properties": {
+ "file_id": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filename": {
+ "description": "filename for the onlist",
+ "type": "string"
+ },
+ "filetype": {
+ "description": "the type of file",
+ "type": "string"
+ },
+ "filesize": {
+ "description": "the size of the file in bytes",
+ "type": "integer"
+ },
+ "url": {
+ "description": "The path or url to the file",
+ "type": "string"
+ },
+ "urltype": {
+ "description": "type of file path",
+ "type": "string",
+ "enum": ["local", "ftp", "http", "https"]
+ },
+ "location": {
+ "description": "location of onlist",
+ "type": "string",
+ "enum": ["local", "remote"]
+ },
+ "md5": {
+ "description": "md5sum for the file pointed to by filename",
+ "type": "string" }
+ },
+ "required": [
+ "file_id",
+ "filename",
+ "filetype",
+ "filesize",
+ "url",
+ "urltype"
+ ]
+ },
+ "regions": {
+ "description": "The regions being joined",
+ "type": "array",
+ "items": {
+ "$ref": "#/$defs/region"
+ }
+ }
+ },
+ "required": [
+ "region_id",
+ "region_type",
+ "sequence_type",
+ "sequence",
+ "min_len",
+ "max_len"
+ ],
+ "if": {
+ "properties": {
+ "min_len": {
+ "const": 0
+ }
+ }
+ },
+ "then": {
+ "properties": {
+ "sequence": {
+ "type": "string",
+ "pattern": "^[ACGTRYMKSWHBVDNX]*$"
+ }
+ }
+ },
+ "else": {
+ "properties": {
+ "sequence": {
+ "type": "string",
+ "minLength": 1,
+ "pattern": "^[ACGTRYMKSWHBVDNX]+$"
+ }
+ }
+ }
+ },
+ "read": {
+ "title": "Read",
+ "type": "object",
+ "properties": {
+ "read_id": {
+ "type": "string",
+ "description": "The unique identifier for the read."
+ },
+ "name": {
+ "type": "string",
+ "description": "The name of the read."
+ },
+ "modality": {
+ "type": "string",
+ "description": "The modality of the assay generating the read."
+ },
+ "primer_id": {
+ "type": "string",
+ "description": "The region id of the primer used."
+ },
+ "min_len": {
+ "type": "integer",
+ "minimum": 0,
+ "description": "The minimum length of the read, must be greater than or equal to 0."
+ },
+ "max_len": {
+ "type": "integer",
+ "exclusiveMinimum": 0,
+ "description": "The maximum length of the read, must be greater than 0."
+ },
+ "strand": {
+ "type": "string",
+ "enum": ["pos", "neg"],
+ "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')."
+ },
+ "files": {
+ "description": "An array of files containing the reads",
+ "type": "array",
+ "items": {
+ "type": "object",
+ "properties": {
+ "file_id": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filename": {
+ "description": "filename",
+ "type": "string"
+ },
+ "filetype": {
+ "description": "the type of file",
+ "type": "string"
+ },
+ "filesize": {
+ "description": "the size of the file in bytes",
+ "type": "integer"
+ },
+ "url": {
+ "description": "The path or url to the file",
+ "type": "string"
+ },
+ "urltype": {
+ "description": "type of file path",
+ "type": "string",
+ "enum": ["local", "ftp", "http", "https"]
+ },
+ "md5": {
+ "description": "md5sum for the file pointed to by filename",
+ "type": "string",
+ "pattern": "^[a-f0-9]{32}$"
+ }
+ }
+ }
+ }
+ },
+ "required": [
+ "read_id",
+ "modality",
+ "primer_id",
+ "min_len",
+ "max_len",
+ "strand"
+ ],
+ "additionalProperties": false
+ }
+ }
+ }
+
\ No newline at end of file
diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py
index 702dfbe9..672ebaa2 100644
--- a/seqspec/seqspec_check.py
+++ b/seqspec/seqspec_check.py
@@ -1,12 +1,24 @@
+"""Check module for seqspec CLI.
+
+This module provides functionality to validate seqspec files against the specification schema.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+from typing import List, Dict, Optional
+
from jsonschema import Draft4Validator
import yaml
from os import path
+
from seqspec.utils import load_spec, file_exists
from seqspec.Assay import Assay
-from argparse import RawTextHelpFormatter
+
+from seqspec import get_version
def setup_check_args(parser):
+ """Create and configure the check command subparser."""
subparser = parser.add_parser(
"check",
description="""
@@ -19,182 +31,384 @@ def setup_check_args(parser):
help="Validate seqspec file against specification",
formatter_class=RawTextHelpFormatter,
)
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
+ help="Path to output file",
+ type=Path,
+ default=None,
+ )
+ subparser.add_argument(
+ "-s",
+ "--skip",
+ metavar="SKIP",
+ help="Skip checks",
type=str,
default=None,
+ choices=["igvf", "igvf_onlist_skip"],
)
+
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
+
return subparser
-def validate_check_args(parser, args):
- spec_fn = args.yaml
- o = args.o
- schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json")
+def validate_check_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the check command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
- return run_check(schema_fn, spec_fn, o)
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
-def run_check(schema_fn, spec_fn, o):
- spec = load_spec(spec_fn)
+def format_error(errobj, idx=0):
+ return f"[error {idx}] {errobj['error_message']}"
- with open(schema_fn, "r") as stream:
- schema = yaml.load(stream, Loader=yaml.Loader)
- v = Draft4Validator(schema)
- errors = check(v, spec, spec_fn)
+def seqspec_check(
+ spec: Assay, spec_fn: str, filter_type: Optional[str] = None
+) -> List[Dict]:
+ """Core functionality to check a seqspec and return filtered errors.
- if errors:
- if o:
- with open(o, "w") as f:
- print("\n".join(errors), file=f)
- else:
- print("\n".join(errors))
+ Args:
+ spec: The Assay object to check
+ spec_fn: Path to the spec file, used for relative path resolution
+ filter_type: Optional filter type to apply to errors (e.g. "igvf", "igvf_onlist_skip")
+
+ Returns:
+ List of error dictionaries
+ """
+ errors = check(spec, spec_fn, filter_type)
+ if filter_type:
+ errors = filter_errors(errors, filter_type)
return errors
-def check(schema: Draft4Validator, spec: Assay, spec_fn: str):
- errors = []
- idx = 0
+def run_check(parser: ArgumentParser, args: Namespace):
+ """Run the check command."""
+ validate_check_args(parser, args)
+
+ spec = load_spec(args.yaml)
+ errors = seqspec_check(spec, args.yaml, args.skip)
+
+ if args.output:
+ with open(args.output, "w") as f:
+ for idx, e in enumerate(errors, 1):
+ print(format_error(e, idx), file=f)
+ else:
+ for idx, e in enumerate(errors, 1):
+ print(format_error(e, idx))
+ return errors
+
+
+IGVF_ONLIST_SKIP_FILTERS = [
+ {"error_type": "check_onlist_files_exist", "error_object": "onlist"}
+]
- # with open("del.json", "w") as f:
- # json.dump(spec.to_dict(), f, indent=4)
-
- for idx, error in enumerate(schema.iter_errors(spec.to_dict()), 1):
- errors.append(
- f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]"
- )
- idx += 1
- # check that the modalities are unique
- if len(spec.modalities) != len(set(spec.modalities)):
- errors.append(
- f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique"
- )
- idx += 1
- # check that region_ids of the first level of the spec correspond to the modalities
- # one for each modality
- modes = spec.modalities
- rgns = spec.library_spec
- for r in rgns:
- rid = r.region_id
- if rid not in modes:
- errors.append(
- f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]"
+def filter_errors(errors, filter_type):
+ filters = None
+ if filter_type == "igvf_onlist_skip":
+ filters = IGVF_ONLIST_SKIP_FILTERS
+
+ if filters:
+ ferrors = []
+ for error in errors:
+ # Check if this specific error combination exists in the filters
+ should_filter = any(
+ error["error_type"] == filter_item["error_type"]
+ and error["error_object"] == filter_item["error_object"]
+ for filter_item in filters
)
- idx += 1
- # get all of the onlist files in the spec and check that they exist relative to the path of the spec
- modes = spec.modalities
- olrgns = []
- for m in modes:
- olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()]
-
- # check paths relative to spec_fn
- for ol in olrgns:
- if ol.urltype == "local":
- if ol.filename[:-3] == ".gz":
- check = path.join(path.dirname(spec_fn), ol.filename[:-3])
- if not path.exists(check):
- errors.append(f"[error {idx}] {ol.filename[:-3]} does not exist")
- idx += 1
- else:
- check = path.join(path.dirname(spec_fn), ol.filename)
- check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz")
- if not path.exists(check) and not path.exists(check_gz):
- errors.append(f"[error {idx}] {ol.filename} does not exist")
- idx += 1
- elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp":
- # ping the link with a simple http request to check if the file exists at that URI
- if spec.seqspec_version == "0.3.0":
- if not file_exists(ol.url):
- errors.append(f"[error {idx}] {ol.filename} does not exist")
- idx += 1
- else:
- if not file_exists(ol.filename):
- errors.append(f"[error {idx}] {ol.filename} does not exist")
- idx += 1
+ # Only keep errors that don't match our filter criteria
+ if not should_filter:
+ ferrors.append(error)
+ return ferrors
+ else:
+ return errors
- # read ids should be unique
- read_ids = set()
- for read in spec.sequence_spec:
- if read.read_id in read_ids:
- errors.append(
- f"[error {idx}] read_id '{read.read_id}' is not unique across all reads"
+
+def check(spec: Assay, spec_fn: str, skip: str = None):
+ # Variety of checks against schema
+ def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0):
+ if skip == "igvf":
+ schema_fn = path.join(
+ path.dirname(__file__), "schema/seqspec_igvf.schema.json"
+ )
+ elif skip == "igvf_onlist_skip":
+ schema_fn = path.join(
+ path.dirname(__file__), "schema/seqspec_igvf_onlist_skip.schema.json"
)
- idx += 1
else:
- read_ids.add(read.read_id)
-
- # iterate through reads in sequence_spec and check that the fastq files exist
- for read in spec.sequence_spec:
- spec_fn = path.dirname(spec_fn)
- for f in read.files:
- if f.urltype == "local":
- check = path.join(spec_fn, f.filename)
- if not path.exists(check):
- errors.append(f"[error {idx}] {f.filename} does not exist")
- idx += 1
- elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp":
- # ping the link with a simple http request to check if the file exists at that URI
- if not file_exists(f.url):
- errors.append(f"[error {idx}] {f.filename} does not exist")
- idx += 1
+ schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json")
+ with open(schema_fn, "r") as stream:
+ schema = yaml.load(stream, Loader=yaml.Loader)
+ validator = Draft4Validator(schema)
+ for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1):
+ err_elements = [repr(index) for index in error.path]
+ err_path = f"spec[{']['.join(err_elements)}]"
+ errobj = {
+ "error_type": "check_schema",
+ "error_message": f"{error.message} in {err_path}",
+ "error_object": err_elements[-1],
+ }
+ # errors.append(f"[error {idx}] {error.message} in {err_path}]")
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
- # check that the primer ids, strand tuple pairs are unique across all reads
- primer_strand_pairs = set()
- for read in spec.sequence_spec:
- if (read.primer_id, read.strand) in primer_strand_pairs:
- errors.append(
- f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads"
- )
+ # Modalities are unique
+ def check_unique_modalities(spec, spec_fn, errors, idx):
+ if len(spec.modalities) != len(set(spec.modalities)):
+ errobj = {
+ "error_type": "check_unique_modalities",
+ "error_message": f"modalities [{', '.join(spec.modalities)}] are not unique",
+ "error_object": "modalities",
+ }
+ # errors.append(
+ # f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique"
+ # )
+ errors.append(errobj)
idx += 1
- else:
- primer_strand_pairs.add((read.primer_id, read.strand))
+ return (errors, idx)
- # TODO add option to check md5sum
+ # Region_ids of the first level correspond to the modalities (one per modality)
+ def check_region_ids_modalities(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+ rgns = spec.library_spec
+ for r in rgns:
+ rid = r.region_id
+ if rid not in modes:
+ errobj = {
+ "error_type": "check_region_ids_modalities",
+ "error_message": f"region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
- # check that the region_id is unique across all regions
- rgn_ids = set()
- for m in modes:
- for rgn in spec.get_libspec(m).get_leaves():
- if rgn.region_id in rgn_ids:
- errors.append(
- f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions"
- )
+ # Onlist files exist relative to the path of the spec or http
+ def check_onlist_files_exist(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+ olrgns = []
+ for m in modes:
+ olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()]
+
+ # check paths relative to spec_fn
+ for ol in olrgns:
+ if ol.urltype == "local":
+ if ol.filename[:-3] == ".gz":
+ check = path.join(path.dirname(spec_fn), ol.filename[:-3])
+ if not path.exists(check):
+ errobj = {
+ "error_type": "check_onlist_files_exist",
+ "error_message": f"{ol.filename[:-3]} does not exist",
+ "error_object": "onlist",
+ }
+ # errors.append(
+ # f"[error {idx}] {ol.filename[:-3]} does not exist"
+ # )
+ errors.append(errobj)
+ idx += 1
+ else:
+ check = path.join(path.dirname(spec_fn), ol.filename)
+ check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz")
+ if not path.exists(check) and not path.exists(check_gz):
+ errobj = {
+ "error_type": "check_onlist_files_exist",
+ "error_message": f"{ol.filename} does not exist",
+ "error_object": "onlist",
+ }
+ # errors.append(f"[error {idx}] {ol.filename} does not exist")
+ errors.append(errobj)
+ idx += 1
+ elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp":
+ # ping the link with a simple http request to check if the file exists at that URI
+ if spec.seqspec_version == get_version():
+ if not file_exists(ol.url):
+ errobj = {
+ "error_type": "check_onlist_files_exist",
+ "error_message": f"{ol.filename} does not exist",
+ "error_object": "onlist",
+ }
+
+ # errors.append(f"[error {idx}] {ol.filename} does not exist")
+ errors.append(errobj)
+ idx += 1
+ else:
+ if not file_exists(ol.filename):
+ errobj = {
+ "error_type": "check_onlist_files_exist",
+ "error_message": f"{ol.filename} does not exist",
+ "error_object": "onlist",
+ }
+ # errors.append(f"[error {idx}] {ol.filename} does not exist")
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
+
+ # Read ids are unique
+ def check_unique_read_ids(spec, spec_fn, errors, idx):
+ read_ids = set()
+ for read in spec.sequence_spec:
+ if read.read_id in read_ids:
+ errobj = {
+ "error_type": "check_unique_read_ids",
+ "error_message": f"read_id '{read.read_id}' is not unique across all reads",
+ "error_object": "read",
+ }
+ # errors.append(
+ # f"[error {idx}] read_id '{read.read_id}' is not unique across all reads"
+ # )
+ errors.append(errobj)
idx += 1
else:
- rgn_ids.add(rgn.region_id)
+ read_ids.add(read.read_id)
+ return (errors, idx)
- # check that the modality is in the reads
- for read in spec.sequence_spec:
- if read.modality not in modes:
- errors.append(
- f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities"
- )
- idx += 1
+ # Read files exist
+ def check_read_files_exist(spec, spec_fn, errors, idx):
+ for read in spec.sequence_spec:
+ spec_fn = path.dirname(spec_fn)
+ for f in read.files:
+ if f.urltype == "local":
+ check = path.join(spec_fn, f.filename)
+ if not path.exists(check):
+ errobj = {
+ "error_type": "check_read_files_exist",
+ "error_message": f"{f.filename} does not exist",
+ "error_object": "file",
+ }
+ # errors.append(f"[error {idx}] {f.filename} does not exist")
+ errors.append(errobj)
+ idx += 1
+ elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp":
+ # ping the link with a simple http request to check if the file exists at that URI
+ if not file_exists(f.url):
+ errobj = {
+ "error_type": "check_read_files_exist",
+ "error_message": f"{f.filename} does not exist",
+ "error_object": "file",
+ }
+ # errors.append(f"[error {idx}] {f.filename} does not exist")
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
+
+ # Primer ids, strand tuple pairs are unique across all reads
+ def check_unique_read_primer_strand_pairs(spec, spec_fn, errors, idx):
+ primer_strand_pairs = set()
+ for read in spec.sequence_spec:
+ if (read.primer_id, read.strand) in primer_strand_pairs:
+ errobj = {
+ "error_type": "check_unique_read_primer_strand_pairs",
+ "error_message": f"primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads",
+ "error_object": "read",
+ }
+ # errors.append(
+ # f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads"
+ # )
+ errors.append(errobj)
+ idx += 1
+ else:
+ primer_strand_pairs.add((read.primer_id, read.strand))
+ return (errors, idx)
+
+ # TODO add option to check md5sum
+ def check_md5sum(spec, spec_fn, errors, idx):
+ return (errors, idx)
+
+ # Region_id is unique across all regions
+ def check_unique_region_ids(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+ rgn_ids = set()
+ for m in modes:
+ for rgn in spec.get_libspec(m).get_leaves():
+ if rgn.region_id in rgn_ids:
+ errobj = {
+ "error_type": "check_unique_region_ids",
+ "error_message": f"region_id '{rgn.region_id}' is not unique across all regions",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions"
+ # )
+ errors.append(errobj)
+ idx += 1
+ else:
+ rgn_ids.add(rgn.region_id)
+ return (errors, idx)
+
+ # Modality is in the reads
+ def check_read_modalities(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+ for read in spec.sequence_spec:
+ if read.modality not in modes:
+ errobj = {
+ "error_type": "check_read_modalities",
+ "error_message": f"read '{read.read_id}' modality '{read.modality}' does not exist in the modalities",
+ "error_object": "read",
+ }
+ # errors.append(
+ # f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
# check that the unique primer ids exist as a region id in the library_spec
- for read in spec.sequence_spec:
- if read.primer_id not in rgn_ids:
- errors.append(
- f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec"
- )
- idx += 1
+ # TODO is there a better way to get the rgn_ids?
+ def check_primer_ids_in_region_ids(spec, spec_fn, errors, idx):
+ # first get all unique region_ids
+ modes = spec.modalities
+ rgn_ids = set()
+ for m in modes:
+ for rgn in spec.get_libspec(m).get_leaves():
+ if rgn.region_id in rgn_ids:
+ pass
+ else:
+ rgn_ids.add(rgn.region_id)
+
+ # then check that the primer ids exist in the region_ids
+ for read in spec.sequence_spec:
+ if read.primer_id not in rgn_ids:
+ errobj = {
+ "error_type": "check_primer_ids_in_region_ids",
+ "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec",
+ "error_object": "read",
+ }
+ # errors.append(
+ # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
# NOTE: this is a strong assumption that may be relaxed in the future
# check that the primer id for each read is in the leaves of the spec for that modality
- for read in spec.sequence_spec:
- mode = spec.get_libspec(read.modality)
- leaves = mode.get_leaves()
- if read.primer_id not in [i.region_id for i in leaves]:
- errors.append(
- f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'"
- )
- idx += 1
+ def check_primer_ids_in_libspec_leaves(spec, spec_fn, errors, idx):
+ for read in spec.sequence_spec:
+ mode = spec.get_libspec(read.modality)
+ leaves = mode.get_leaves()
+ if read.primer_id not in [i.region_id for i in leaves]:
+ errobj = {
+ "error_type": "check_primer_ids_in_libspec_leaves",
+ "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'",
+ "error_object": "read",
+ }
+ # errors.append(
+ # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
# check that the max read len is not longer than the max len of the lib spec after the primer
# for read in spec.sequence_spec:
@@ -202,89 +416,172 @@ def check(schema: Draft4Validator, spec: Assay, spec_fn: str):
# leaves = mode.get_leaves()
# idx = [i.region_id for i in leaves].index(read.primer_id)
- # if a region has a sequence type "fixed" then it should not contain subregions
- # if a region has a sequence type "joiend" then it should contain subregions
- # if a region has a sequence type "random" then it should not contain subregions and should be all X's
- # if a region has a sequence type "onlist" then it should have an onlist object
- def seqtype_check(rgn, errors, idx):
- # this is a recursive function that iterates through all regions and checks the sequence type
- if rgn.sequence_type == "fixed" and rgn.regions:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions"
- )
- idx += 1
- if rgn.sequence_type == "joined" and not rgn.regions:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions"
- )
- idx += 1
- if rgn.sequence_type == "random" and rgn.regions:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions"
- )
- idx += 1
- if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's"
- )
- idx += 1
- if rgn.sequence_type == "onlist" and not rgn.onlist:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object"
- )
- idx += 1
- if rgn.regions:
- for r in rgn.regions:
- errors, idx = seqtype_check(r, errors, idx)
- return (errors, idx)
+ def check_sequence_types(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+
+ # if a region has a sequence type "fixed" then it should not contain subregions
+ # if a region has a sequence type "joiend" then it should contain subregions
+ # if a region has a sequence type "random" then it should not contain subregions and should be all X's
+ # if a region has a sequence type "onlist" then it should have an onlist object
+ def seqtype_check(rgn, errors, idx):
+ # this is a recursive function that iterates through all regions and checks the sequence type
+ if rgn.sequence_type == "fixed" and rgn.regions:
+ errobj = {
+ "error_type": "check_sequence_types",
+ "error_message": f"'{rgn.region_id}' sequence_type is 'fixed' and contains subregions",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions"
+ # )
+ errors.append(errobj)
+ idx += 1
+ if rgn.sequence_type == "joined" and not rgn.regions:
+ errobj = {
+ "error_type": "check_sequence_types",
+ "error_message": f"'{rgn.region_id}' sequence_type is 'joined' and does not contain subregions",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions"
+ # )
+ errors.append(errobj)
+ idx += 1
+ if rgn.sequence_type == "random" and rgn.regions:
+ errobj = {
+ "error_type": "check_sequence_types",
+ "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions",
+ "error_object": "region",
+ }
+ errors.append(errobj)
+ idx += 1
+ if rgn.sequence_type == "random" and (
+ set(rgn.sequence) != {"X"}
+ or not (rgn.min_len <= len(rgn.sequence) <= rgn.max_len)
+ ):
+ errobj = {
+ "error_type": "check_sequence_types",
+ "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's",
+ "error_object": "region",
+ }
+ errors.append(errobj)
+ idx += 1
+ if rgn.sequence_type == "onlist" and not rgn.onlist:
+ errobj = {
+ "error_type": "check_sequence_types",
+ "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object",
+ "error_object": "region",
+ }
+ errors.append(errobj)
+ idx += 1
+ if rgn.regions:
+ for r in rgn.regions:
+ errors, idx = seqtype_check(r, errors, idx)
+ return (errors, idx)
+
+ for m in modes:
+ for rgn in [spec.get_libspec(m)]:
+ errors, idx = seqtype_check(rgn, errors, idx)
- for m in modes:
- for rgn in [spec.get_libspec(m)]:
- errors, idx = seqtype_check(rgn, errors, idx)
+ return (errors, idx)
# check the lengths of every region against the max_len, using a recursive function
- def len_check(rgn, errors, idx):
- if rgn.regions:
- for r in rgn.regions:
- errors, idx = len_check(r, errors, idx)
- if rgn.max_len < rgn.min_len:
- errors.append(
- f"[error {idx}] '{rgn.region_id}' max_len is less than min_len"
- )
- idx += 1
+ def check_region_lengths(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+
+ def len_check(rgn, errors, idx):
+ if rgn.regions:
+ for r in rgn.regions:
+ errors, idx = len_check(r, errors, idx)
+ if rgn.max_len < rgn.min_len:
+ errobj = {
+ "error_type": "check_region_lengths",
+ "error_message": f"'{rgn.region_id}' max_len is less than min_len",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] '{rgn.region_id}' max_len is less than min_len"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
+
+ for m in modes:
+ for rgn in [spec.get_libspec(m)]:
+ errors, idx = len_check(rgn, errors, idx)
return (errors, idx)
- for m in modes:
- for rgn in [spec.get_libspec(m)]:
- errors, idx = len_check(rgn, errors, idx)
+ # errors, idx = check_region_lengths(spec, spec_fn, errors, idx)
# check that the length of the sequence is equal to the max_len using a recursive function
# an assumption in the code and spec is that the displayed sequence is equal to the max_len
- def seq_len_check(rgn, errors, idx):
- if rgn.regions:
- for r in rgn.regions:
- errors, idx = seq_len_check(r, errors, idx)
- if rgn.sequence and (
- len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len
- ):
- # noqa
- errors.append(
- f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})"
- )
- idx += 1
- return (errors, idx)
+ def check_sequence_lengths(spec, spec_fn, errors, idx):
+ modes = spec.modalities
+
+ def seq_len_check(rgn, errors, idx):
+ if rgn.regions:
+ for r in rgn.regions:
+ errors, idx = seq_len_check(r, errors, idx)
+ if rgn.sequence and (
+ len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len
+ ):
+ # noqa
+ errobj = {
+ "error_type": "check_sequence_lengths",
+ "error_message": f"'{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})",
+ "error_object": "region",
+ }
+ # errors.append(
+ # f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})"
+ # )
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
- for m in modes:
- for rgn in [spec.get_libspec(m)]:
- errors, idx = seq_len_check(rgn, errors, idx)
+ for m in modes:
+ for rgn in [spec.get_libspec(m)]:
+ errors, idx = seq_len_check(rgn, errors, idx)
+ return (errors, idx)
# check that the number of files in each "File" object for all Read object are all the same length
- nfiles = []
- for read in spec.sequence_spec:
- nfiles.append(len(read.files))
+ def check_read_file_count(spec, spec_fn, errors, idx):
+ nfiles = []
+ for read in spec.sequence_spec:
+ nfiles.append(len(read.files))
+
+ if len(set(nfiles)) != 1:
+ errobj = {
+ "error_type": "check_read_file_count",
+ "error_message": "Reads must have the same number of files",
+ "error_object": "read",
+ }
+ # errors.append(f"[error {idx}] Reads must have the same number of files")
+ errors.append(errobj)
+ idx += 1
+ return (errors, idx)
- if len(set(nfiles)) != 1:
- errors.append(f"[error {idx}] Reads must have the same number of files")
- idx += 1
+ # errors, idx = check_read_file_count(spec, spec_fn, errors, idx)
+
+ errors = []
+ idx = 0
+ checks = {
+ "check_schema": check_schema,
+ "check_unique_modalities": check_unique_modalities,
+ "check_region_ids_modalities": check_region_ids_modalities,
+ "check_onlist_files_exist": check_onlist_files_exist,
+ "check_unique_read_ids": check_unique_read_ids,
+ "check_read_files_exist": check_read_files_exist,
+ "check_unique_read_primer_strand_pairs": check_unique_read_primer_strand_pairs,
+ "check_unique_region_ids": check_unique_region_ids,
+ "check_read_modalities": check_read_modalities,
+ "check_primer_ids_in_region_ids": check_primer_ids_in_region_ids,
+ "check_primer_ids_in_libspec_leaves": check_primer_ids_in_libspec_leaves,
+ "check_sequence_types": check_sequence_types,
+ "check_region_lengths": check_region_lengths,
+ "check_sequence_lengths": check_sequence_lengths,
+ "check_read_file_count": check_read_file_count,
+ }
+ for k, v in checks.items():
+ errors, idx = v(spec, spec_fn, errors, idx)
return errors
diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py
new file mode 100644
index 00000000..11a4d1a1
--- /dev/null
+++ b/seqspec/seqspec_convert.py
@@ -0,0 +1,434 @@
+"""Convert module for seqspec CLI.
+
+This module provides functionality to convert between different formats (seqspec, genbank, token).
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+import json
+import numpy as np
+from typing import Dict, List, Tuple
+import os
+
+from seqspec.utils import load_genbank, load_spec
+from seqspec.Region import Region
+from seqspec.Assay import Assay
+
+
+# Load schema and constants
+schema_fn = os.path.join(os.path.dirname(__file__), "schema/seqspec.schema.json")
+with open(schema_fn, "r") as f:
+ schema = json.load(f)
+REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"]
+MODALITIES = schema["properties"]["modalities"]["items"]["enum"]
+SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"]
+
+
+def setup_convert_args(parser) -> ArgumentParser:
+ """Create and configure the convert command subparser."""
+ subparser = parser.add_parser(
+ "convert",
+ description="""
+Convert between different formats (seqspec, genbank, token).
+
+Examples:
+seqspec convert -ifmt seqspec -ofmt token spec.yaml -o output_dir # Convert seqspec to token format
+seqspec convert -ifmt genbank -ofmt seqspec input.gb -o spec.yaml # Convert genbank to seqspec
+---
+""",
+ help="Convert between different formats",
+ formatter_class=RawTextHelpFormatter,
+ )
+ choices = ["genbank", "seqspec", "token"]
+ subparser.add_argument(
+ "-ifmt",
+ "--input-format",
+ help="Input format",
+ type=str,
+ default="seqspec",
+ choices=choices,
+ )
+ subparser.add_argument(
+ "-ofmt",
+ "--output-format",
+ help="Output format",
+ type=str,
+ default="token",
+ choices=choices,
+ )
+ subparser.add_argument(
+ "-o",
+ "--output",
+ metavar="OUT",
+ help="Path to output file or directory",
+ type=Path,
+ default=None,
+ required=False,
+ )
+ subparser.add_argument(
+ "input_file",
+ metavar="IN",
+ help="Path to input file",
+ type=Path,
+ )
+ return subparser
+
+
+def validate_convert_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the convert command arguments."""
+ if not Path(args.input_file).exists():
+ parser.error(f"Input file does not exist: {args.input_file}")
+
+ if args.output and Path(args.output).exists():
+ if args.output_format == "token":
+ if not Path(args.output).is_dir():
+ parser.error(
+ f"Output path exists but is not a directory: {args.output}"
+ )
+ else:
+ if not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
+
+def run_convert(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the convert command."""
+ validate_convert_args(parser, args)
+
+ CONVERT = {
+ ("genbank", "seqspec"): gb_to_seqspec,
+ ("seqspec", "token"): seqspec_to_token,
+ }
+
+ file = load_input_file(args.input_file, args.input_format)
+ result = CONVERT[(args.input_format, args.output_format)](file)
+
+ if args.output:
+ if args.output_format == "token":
+ save_tokenized_spec(*result, str(args.output)) # noqa
+ else:
+ # Handle other output formats here
+ pass
+ else:
+ return result
+
+
+def load_input_file(fn: Path, ifmt: str):
+ """Load input file based on format."""
+ LOAD = {
+ "genbank": load_genbank,
+ "seqspec": load_spec,
+ # "token": load_token,
+ }
+ return LOAD[ifmt](fn)
+
+
+def get_feature_names() -> List[str]:
+ """Generate ordered list of column names"""
+ features = []
+
+ # Modality one-hot features
+ features.extend([f"modality_{mod}" for mod in MODALITIES])
+
+ # Region type one-hot features
+ features.extend([f"region_{rt}" for rt in REGION_TYPES])
+
+ # Sequence type one-hot features
+ features.extend([f"seq_{st}" for st in SEQUENCE_TYPES])
+
+ # Numerical features
+ features.extend(["min_len", "max_len", "position"])
+
+ return features
+
+
+def save_tokenized_spec(
+ matrix: np.ndarray, row_identifiers: List[Tuple[str, str, str]], output_path: str
+):
+ """
+ Save tokenized spec output to three files:
+ - spec.npy: The matrix data
+ - rows.txt: Tab-separated list of (spec_id, modality, region_type)
+ - cols.txt: List of column names
+
+ Args:
+ matrix: The tokenized matrix from tokenize_specs
+ row_identifiers: List of (spec_id, modality, region_type) tuples
+ output_path: Path to save the output (directory)
+ """
+ # Create output directory if needed
+ output_dir = Path(output_path)
+ output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Save matrix
+ np.save(output_dir / "spec.npy", matrix)
+
+ # Save row identifiers (tab-separated)
+ with open(output_dir / "rows.txt", "w") as f:
+ for spec_id, modality, region_type in row_identifiers:
+ f.write(f"{spec_id}\t{modality}\t{region_type}\n")
+
+ # Save column names
+ feature_names = get_feature_names()
+ with open(output_dir / "cols.txt", "w") as f:
+ for feature in feature_names:
+ f.write(f"{feature}\n")
+
+
+def seqspec_to_token(spec):
+ # for each modalitiy, make a dictionary of regions
+ specs_regions = {}
+ modalities = spec.list_modalities()
+ for modality in modalities:
+ regions = [i.to_dict() for i in spec.get_libspec(modality).get_leaves()]
+ specs_regions[modality] = regions
+
+ # Convert to tokenized matrix
+ return tokenize_specs({spec.assay_id: specs_regions})
+
+
+def tokenize_specs(
+ specs_regions: Dict[str, Dict[str, List[Dict]]],
+) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]:
+ """
+ Convert specs into a single matrix where each row represents a complete region specification
+
+ Args:
+ specs_regions: Dict[spec_id -> Dict[modality -> List[region_dict]]]
+
+ Returns:
+ - Matrix where each row is [modality_onehot, region_type_onehot, sequence_type_onehot, min_len, max_len, position]
+ - List of (spec_id, modality, region_type) identifying each row
+ """
+ # Calculate feature dimensions
+ n_modality_features = len(MODALITIES)
+ n_region_type_features = len(REGION_TYPES)
+ n_sequence_type_features = len(SEQUENCE_TYPES)
+
+ # Total features = one-hot encodings + numerical features + position
+ total_features = (
+ n_modality_features # modality one-hot
+ + n_region_type_features # region_type one-hot
+ + n_sequence_type_features # sequence_type one-hot
+ + 2 # min_len, max_len
+ + 1 # position in region list (1-based)
+ )
+
+ # features = get_feature_names()
+
+ rows = [] # Will hold our feature vectors
+ row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples
+
+ for spec_id, modality_regions in specs_regions.items():
+ for modality, regions in modality_regions.items():
+ # Enumerate regions to get position (1-based)
+ for position, region in enumerate(regions, start=1):
+ # Create feature vector for this region
+ feature_vector = np.zeros(total_features).astype(int)
+ current_idx = 0
+
+ # Add modality one-hot
+ modality_idx = MODALITIES.index(modality)
+ feature_vector[modality_idx] = 1
+ current_idx += n_modality_features
+
+ # Add region_type one-hot
+ region_type_idx = REGION_TYPES.index(region["region_type"])
+ feature_vector[current_idx + region_type_idx] = 1
+ current_idx += n_region_type_features
+
+ # Add sequence_type one-hot
+ sequence_type_idx = SEQUENCE_TYPES.index(region["sequence_type"])
+ feature_vector[current_idx + sequence_type_idx] = 1
+ current_idx += n_sequence_type_features
+
+ # Add lengths
+ feature_vector[current_idx] = region["min_len"]
+ feature_vector[current_idx + 1] = region["max_len"]
+ current_idx += 2
+
+ # Add position
+ feature_vector[current_idx] = position
+
+ # Store feature vector and identifier
+ rows.append(feature_vector)
+ row_identifiers.append((spec_id, modality, region["region_type"]))
+
+ return np.array(rows), row_identifiers
+
+
+def gb_to_seqspec(gb):
+ ex = gb_to_list(gb)
+ nested_json = nest_intervals(ex)
+ filled_regions = fill_gaps(gb.sequence, nested_json)
+ regions = convert(filled_regions)
+ reads = []
+ spec = Assay(
+ "genbank",
+ "illumina",
+ "genbank thing",
+ "doi",
+ "date",
+ ["source"],
+ "description",
+ "",
+ "",
+ "",
+ "",
+ reads,
+ regions,
+ )
+ return spec
+
+
+def gb_to_list(gb):
+ feat = []
+ label = "source"
+ for f in gb.features:
+ id = f.key
+
+ if "complement" in f.location:
+ start, stop = tuple(map(int, f.location[11:-1].split("..")))
+ else:
+ start, stop = tuple(map(int, f.location.split("..")))
+
+ # convert to 0-index
+ start -= 1
+ length = stop - start
+ seq = gb.sequence[start:stop]
+
+ for q in f.qualifiers:
+ if q.key == "/label=":
+ label = q.value
+ break
+ feat.append(
+ {
+ "id": id,
+ "label": label,
+ "start": start,
+ "stop": stop,
+ "length": length,
+ "seq": seq,
+ }
+ )
+ return feat
+
+
+def nest_intervals(intervals):
+ def nest(start_index, end_limit):
+ nested = []
+
+ i = start_index
+ while i < len(intervals) and intervals[i]["start"] < end_limit:
+ current_interval = intervals[i]
+ child, next_index = nest(i + 1, current_interval["stop"])
+ interval_obj = {
+ "id": current_interval["id"],
+ "label": current_interval["label"],
+ "start": current_interval["start"],
+ "stop": current_interval["stop"],
+ "length": current_interval["length"],
+ "seq": current_interval["seq"],
+ "regions": child,
+ }
+ nested.append(interval_obj)
+ i = next_index
+
+ return nested, i
+
+ result, _ = nest(0, intervals[0]["stop"])
+ return result
+
+
+def fill_gaps(seq, regions, parent_start=0, parent_stop=0):
+ if len(regions) == 0:
+ return []
+
+ # Insert a filler at the start if necessary
+ if regions[0]["start"] > parent_start:
+ start = parent_start
+ stop = regions[0]["start"]
+ s = seq[start:stop]
+ regions.insert(
+ 0,
+ {
+ "id": "filler_start",
+ "label": "filler_start",
+ "start": start,
+ "stop": stop,
+ "length": stop - start,
+ "seq": s,
+ "regions": [],
+ },
+ )
+
+ new_regions = []
+ for i, region in enumerate(regions):
+ # Append the current region
+ new_regions.append(region)
+
+ # Recursive call for nested regions
+ if "regions" in region:
+ region["regions"] = fill_gaps(
+ seq, region["regions"], region["start"], region["stop"]
+ )
+
+ # Check for gap and insert a filler
+ if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]:
+ filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}'
+ start = region["stop"]
+ stop = regions[i + 1]["start"]
+ s = seq[start:stop]
+ new_regions.append(
+ {
+ "id": filler_id,
+ "label": filler_id,
+ "start": start,
+ "stop": stop,
+ "length": stop - start,
+ "seq": s,
+ "regions": [],
+ }
+ )
+
+ # Insert a filler at the end if necessary
+ if new_regions[-1]["stop"] < parent_stop:
+ start = new_regions[-1]["stop"]
+ stop = parent_stop
+ s = seq[start:stop]
+ new_regions.append(
+ {
+ "id": "filler_end",
+ "label": "filler_end",
+ "start": start,
+ "stop": stop,
+ "length": stop - start,
+ "seq": s,
+ "regions": [],
+ }
+ )
+
+ return new_regions
+
+
+# convert filled regions to seqspec, must be recursive function
+# regions is a list
+def convert(regions):
+ if len(regions) == 0:
+ return []
+ new_regions = []
+ for r in regions:
+ rgn = Region(
+ r["id"],
+ "",
+ r["label"],
+ "fixed",
+ r["seq"],
+ r["length"],
+ r["length"],
+ None,
+ None,
+ )
+ if len(r["regions"]) > 0:
+ rgn.regions = convert(r["regions"])
+ new_regions.append(rgn)
+ return new_regions
diff --git a/seqspec/seqspec_diff.py b/seqspec/seqspec_diff.py
index c4e6a57a..b4ba3a88 100644
--- a/seqspec/seqspec_diff.py
+++ b/seqspec/seqspec_diff.py
@@ -1,45 +1,156 @@
-from .Region import Region
-from seqspec.Assay import Assay
+"""Diff module for seqspec.
+
+This module provides functionality to compare two seqspec files and identify differences.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, Namespace
+from typing import List
+
from seqspec.utils import load_spec
+from seqspec.Assay import Assay
+from seqspec.Region import Region
-def setup_diff_args(parser):
- parser_diff = parser.add_parser(
+def setup_diff_args(parser) -> ArgumentParser:
+ """Create and configure the diff command subparser."""
+ subparser = parser.add_parser(
"diff",
- description="diff two seqspecs",
- help="diff two seqspecs",
+ description="""
+Compare two seqspec files and identify differences.
+
+Examples:
+seqspec diff spec1.yaml spec2.yaml # Compare two specs and print differences
+seqspec diff spec1.yaml spec2.yaml -o diff.txt # Compare specs and save differences to file
+---
+""",
+ help="Compare two seqspec files and identify differences",
+ )
+ subparser.add_argument(
+ "yamlA", help="First sequencing specification yaml file", type=str
)
- parser_diff.add_argument("yamlA", help="Sequencing specification yaml file A")
- parser_diff.add_argument("yamlB", help="Sequencing specification yaml file B")
- parser_diff.add_argument(
+ subparser.add_argument(
+ "yamlB", help="Second sequencing specification yaml file", type=str
+ )
+ subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
- return parser_diff
+ return subparser
+
+
+def validate_diff_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the diff command arguments."""
+ if not Path(args.yamlA).exists():
+ parser.error(f"Input file A does not exist: {args.yamlA}")
+ if not Path(args.yamlB).exists():
+ parser.error(f"Input file B does not exist: {args.yamlB}")
+ if args.output and args.output.exists() and not args.output.is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
+
+def run_diff(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the diff command."""
+ validate_diff_args(parser, args)
+
+ spec_a = load_spec(args.yamlA)
+ spec_b = load_spec(args.yamlB)
+
+ differences = compare_specs(spec_a, spec_b)
+
+ if args.output:
+ args.output.write_text(differences)
+ else:
+ print(differences)
+
+
+def compare_specs(spec_a: Assay, spec_b: Assay) -> str:
+ """Compare two specs and return a string describing their differences."""
+ differences = []
+
+ # Compare modalities
+ modalities_a = set(spec_a.modalities)
+ modalities_b = set(spec_b.modalities)
+
+ if modalities_a != modalities_b:
+ differences.append("Modalities differ:")
+ differences.append(f" Spec A: {', '.join(sorted(modalities_a))}")
+ differences.append(f" Spec B: {', '.join(sorted(modalities_b))}")
+
+ # Compare common modalities
+ common_modalities = modalities_a.intersection(modalities_b)
+ for modality in common_modalities:
+ regions_a = spec_a.get_libspec(modality).get_leaves()
+ regions_b = spec_b.get_libspec(modality).get_leaves()
+
+ region_diffs = compare_regions(regions_a, regions_b)
+ if region_diffs:
+ differences.append(f"\nModality '{modality}' differences:")
+ differences.extend(region_diffs)
+
+ return "\n".join(differences) if differences else "No differences found"
+
+
+def compare_regions(regions_a: List[Region], regions_b: List[Region]) -> List[str]:
+ """Compare two lists of regions and return a list of differences."""
+ differences = []
+
+ # Create lookup dictionaries
+ regions_a_dict = {r.region_id: r for r in regions_a}
+ regions_b_dict = {r.region_id: r for r in regions_b}
+
+ # Find regions unique to each spec
+ unique_to_a = set(regions_a_dict.keys()) - set(regions_b_dict.keys())
+ unique_to_b = set(regions_b_dict.keys()) - set(regions_a_dict.keys())
+
+ if unique_to_a:
+ differences.append(" Regions only in spec A:")
+ for region_id in sorted(unique_to_a):
+ differences.append(f" - {region_id}")
+
+ if unique_to_b:
+ differences.append(" Regions only in spec B:")
+ for region_id in sorted(unique_to_b):
+ differences.append(f" - {region_id}")
+ # Compare common regions
+ common_regions = set(regions_a_dict.keys()).intersection(set(regions_b_dict.keys()))
+ for region_id in sorted(common_regions):
+ region_a = regions_a_dict[region_id]
+ region_b = regions_b_dict[region_id]
-def validate_diff_args(parser, args):
- # if everything is valid the run_diff
- A_fn = args.yamlA
- B_fn = args.yamlB
- # o = args.o
- A = load_spec(A_fn)
- B = load_spec(B_fn)
+ region_diffs = diff_regions(region_a, region_b)
+ if region_diffs:
+ differences.append(f" Region '{region_id}' differences:")
+ differences.extend(f" - {diff}" for diff in region_diffs)
- # load in two specs
- run_diff(A, B)
+ return differences
-def run_diff(A: Assay, B: Assay):
- # What does it mean to diff two assays?
- # Only compare on modalities?
- # itx: pull out regions that have the same name?
- # itx:
- pass
+def diff_regions(region_a: Region, region_b: Region) -> List[str]:
+ """Compare two regions and return a list of differences."""
+ differences = []
+ # Compare basic properties
+ if region_a.region_type != region_b.region_type:
+ differences.append(
+ f"region_type: {region_a.region_type} != {region_b.region_type}"
+ )
+ if region_a.name != region_b.name:
+ differences.append(f"name: {region_a.name} != {region_b.name}")
+ if region_a.sequence_type != region_b.sequence_type:
+ differences.append(
+ f"sequence_type: {region_a.sequence_type} != {region_b.sequence_type}"
+ )
+ if region_a.sequence != region_b.sequence:
+ differences.append(f"sequence: {region_a.sequence} != {region_b.sequence}")
+ if region_a.min_len != region_b.min_len:
+ differences.append(f"min_len: {region_a.min_len} != {region_b.min_len}")
+ if region_a.max_len != region_b.max_len:
+ differences.append(f"max_len: {region_a.max_len} != {region_b.max_len}")
-def diff_regions(R1: Region, R2: Region):
- pass
+ return differences
diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py
index 31be20dd..d1ad8e54 100644
--- a/seqspec/seqspec_file.py
+++ b/seqspec/seqspec_file.py
@@ -1,16 +1,22 @@
+"""File module for seqspec.
+
+This module provides functionality to list and format files present in seqspec files.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+from typing import Dict, List, Optional
+from collections import defaultdict
+import json
+
from seqspec.utils import load_spec
from seqspec.Assay import Assay
-from collections import defaultdict
from seqspec.File import File
-from typing import Dict, List, Optional
-from argparse import RawTextHelpFormatter
-import json
from seqspec import seqspec_find
-import os
-import argparse
-def setup_file_args(parser):
+def setup_file_args(parser) -> ArgumentParser:
+ """Create and configure the file command subparser."""
subparser = parser.add_parser(
"file",
description="""
@@ -29,36 +35,37 @@ def setup_file_args(parser):
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
subparser.add_argument(
"-i",
+ "--ids",
metavar="IDs",
- help=("Ids to list"),
+ help="Ids to list",
type=str,
default=None,
)
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality"),
+ help="Modality",
type=str,
- default=None,
required=True,
)
- # the object we are using to index
- # choices = ["read", "region", "file", "onlist", "region-type"]
choices = ["read", "region", "file", "region-type"]
subparser.add_argument(
"-s",
+ "--selector",
metavar="SELECTOR",
- help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"),
+ help=f"Selector for ID, [{', '.join(choices)}] (default: read)",
type=str,
default="read",
choices=choices,
@@ -66,6 +73,7 @@ def setup_file_args(parser):
choices = ["paired", "interleaved", "index", "list", "json"]
subparser.add_argument(
"-f",
+ "--format",
metavar="FORMAT",
help=f"Format, [{', '.join(choices)}], default: paired",
type=str,
@@ -84,6 +92,7 @@ def setup_file_args(parser):
]
subparser.add_argument(
"-k",
+ "--key",
metavar="KEY",
help=f"Key, [{', '.join(choices)}], default: file_id",
type=str,
@@ -94,7 +103,7 @@ def setup_file_args(parser):
# option to get the full path of the file
subparser.add_argument(
"--fullpath",
- help=argparse.SUPPRESS,
+ help="Use full path for local files",
action="store_true",
default=False,
)
@@ -102,59 +111,50 @@ def setup_file_args(parser):
return subparser
-def validate_file_args(parser, args):
- spec_fn = os.path.abspath(args.yaml)
- o = args.o
- m = args.m # modality
- idtype = args.s # selector
- fmt = args.f # format
- ids = args.i # ids
- k = args.k # key
- fp = args.fullpath
+def validate_file_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the file command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
- if (k == "filesize" or k == "filetype" or k == "urltype" or k == "md5") and (
- fmt == "paired" or fmt == "interleaved" or fmt == "index"
- ):
- parser.error(f"-f {fmt} valid only with -k file_id, filename, url")
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
- return run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=fp)
+ if args.key in ["filesize", "filetype", "urltype", "md5"] and args.format in [
+ "paired",
+ "interleaved",
+ "index",
+ ]:
+ parser.error(
+ f"Format '{args.format}' valid only with key 'file_id', 'filename', or 'url'"
+ )
-def run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=False):
- spec = load_spec(spec_fn)
- if ids is None:
- ids = []
- else:
- ids = ids.split(",")
- files = file(spec, m, ids, idtype, fmt, k, spec_fn, fp)
-
- if files:
- if o:
- with open(o, "w") as f:
- print(files, file=f)
- else:
- print(files)
- return
-
-
-def file(
+def seqspec_file(
spec: Assay,
modality: str,
- ids: List[str],
- idtype: str,
- fmt: str,
- k: Optional[str],
- spec_fn: str,
- fp: bool = False,
-):
+ ids: Optional[List[str]] = None,
+ selector: str = "read",
+) -> Dict[str, List[File]]:
+ """Core functionality to list files from a seqspec.
+
+ Args:
+ spec: The Assay object to operate on
+ spec_fn: Path to the spec file, used for relative path resolution
+ modality: The modality to list files for
+ ids: Optional list of IDs to filter by
+ selector: Type of ID to filter by (read, region, file, region-type)
+
+ Returns:
+ Dictionary mapping IDs to lists of File objects
+ """
# NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST
# NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read.
LIST_FILES = {
"read": list_read_files,
"region": list_region_files,
- "file": list_files,
- # "onlist": list_onlist_files,
+ "file": list_all_files,
}
+
LIST_FILES_BY_ID = {
"read": list_files_by_read_id,
"file": list_files_by_file_id,
@@ -162,34 +162,60 @@ def file(
"region-type": list_files_by_region_type,
}
- if len(ids) == 0:
- # list all the files
- files = LIST_FILES[idtype](spec, modality)
+ # Get files based on whether we're filtering by IDs
+ if not ids:
+ # list all files
+ return LIST_FILES[selector](spec, modality)
else:
- # list only the id files
- files = LIST_FILES_BY_ID[idtype](spec, modality, ids)
-
- FORMAT = {
- "list": format_list_files_metadata,
- "paired": format_list_files,
- "interleaved": format_list_files,
- "index": format_list_files,
- "json": format_json_files,
- }
+ # list files by id
+ return LIST_FILES_BY_ID[selector](spec, modality, ids)
+
+
+def run_file(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the file command."""
+ validate_file_args(parser, args)
- x = FORMAT[fmt](files, fmt, k, spec_fn, fp)
- return x
+ spec = load_spec(args.yaml)
+ ids = args.ids.split(",") if args.ids else []
+
+ files = seqspec_file(
+ spec=spec,
+ modality=args.modality,
+ ids=ids,
+ selector=args.selector,
+ )
+
+ if files:
+ FORMAT = {
+ "list": format_list_files_metadata,
+ "paired": format_list_files,
+ "interleaved": format_list_files,
+ "index": format_list_files,
+ "json": format_json_files,
+ }
+
+ result = FORMAT[args.format](
+ files, args.format, args.key, Path(args.yaml), args.fullpath
+ )
+
+ if args.output:
+ args.output.write_text(str(result))
+ else:
+ print(result)
-def list_read_files(spec, modality):
+def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]:
+ """List files for all reads in a modality."""
files = defaultdict(list)
reads = spec.get_seqspec(modality)
for rd in reads:
- files[rd.read_id] = rd.files
+ if rd.files:
+ files[rd.read_id] = rd.files
return files
-def list_files(spec, modality):
+def list_all_files(spec: Assay, modality: str) -> Dict[str, List[File]]:
+ """List all files in a modality."""
files_rd = list_read_files(spec, modality)
files_rgn = list_region_files(spec, modality)
return {**files_rd, **files_rgn}
@@ -210,131 +236,155 @@ def list_region_files(spec, modality):
def format_list_files_metadata(
- files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False
-):
- x = ""
+ files: Dict[str, List[File]],
+ fmt: str,
+ k: str,
+ spec_fn: Path = Path(""),
+ fp: bool = False,
+) -> str:
+ """Format file metadata as a tab-separated list."""
+ x = []
if k == "all":
for items in zip(*files.values()):
for key, item in zip(files.keys(), items):
- x += f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}\n"
- x = x[:-1]
-
+ x.append(
+ f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}"
+ )
else:
for items in zip(*files.values()):
for key, item in zip(files.keys(), items):
attr = str(getattr(item, k))
id = item.file_id
- x += f"{key}\t{id}\t{attr}\n"
- x = x[:-1]
+ x.append(f"{key}\t{id}\t{attr}")
+ return "\n".join(x)
- return x
-
-def format_json_files(files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False):
+def format_json_files(
+ files: Dict[str, List[File]],
+ fmt: str,
+ k: str,
+ spec_fn: Path = Path(""),
+ fp: bool = False,
+) -> str:
+ """Format files as JSON."""
x = []
for items in zip(*files.values()):
if k == "all":
for key, item in zip(files.keys(), items):
d = item.to_dict()
if item.urltype == "local" and fp:
- d["url"] = os.path.join(os.path.dirname(spec_fn), d["url"])
+ d["url"] = str(spec_fn.parent / d["url"])
x.append(d)
else:
for key, item in zip(files.keys(), items):
attr = getattr(item, k)
if k == "url" and item.urltype == "local" and fp:
- attr = os.path.join(os.path.dirname(spec_fn), attr)
+ attr = str(spec_fn.parent / attr)
x.append({"file_id": item.file_id, k: attr})
return json.dumps(x, indent=4)
-def format_list_files(files: Dict[str, List[File]], fmt, k=None, spec_fn="", fp=False):
- x = ""
+def format_list_files(
+ files: Dict[str, List[File]],
+ fmt: str,
+ k: Optional[str] = None,
+ spec_fn: Path = Path(""),
+ fp: bool = False,
+) -> str:
+ """Format files as a list based on the format type."""
+ x = []
+
if fmt == "paired":
- x = ""
for items in zip(*files.values()):
- t = ""
+ t = []
for i in items:
if k:
attr = str(getattr(i, k))
if k == "url" and i.urltype == "local" and fp:
- attr = os.path.join(os.path.dirname(spec_fn), attr)
- t += f"{attr}\t"
+ attr = str(spec_fn.parent / attr)
+ t.append(attr)
else:
- t += f"{i.filename}\t"
- x += f"{t[:-1]}\n"
- x = x[:-1]
+ t.append(i.filename)
+ x.append("\t".join(t))
- elif fmt == "interleaved" or fmt == "list":
+ elif fmt in ["interleaved", "list"]:
for items in zip(*files.values()):
for item in items:
id = item.filename
if k:
id = str(getattr(item, k))
if k == "url" and item.urltype == "local" and fp:
- id = os.path.join(os.path.dirname(spec_fn), id)
- x += id + "\n"
- x = x[:-1]
+ id = str(spec_fn.parent / id)
+ x.append(id)
+
elif fmt == "index":
+ t = []
for items in zip(*files.values()):
for item in items:
id = item.filename
if k:
id = str(getattr(item, k))
if k == "url" and item.urltype == "local" and fp:
- id = os.path.join(os.path.dirname(spec_fn), id)
- x += id + ","
- x = x[:-1]
+ id = str(spec_fn.parent / id)
+ t.append(id)
+ x.append(",".join(t))
- return x
+ return "\n".join(x)
-def list_files_by_read_id(spec, modality, read_ids):
+def list_files_by_read_id(
+ spec: Assay, modality: str, read_ids: List[str]
+) -> Dict[str, List[File]]:
+ """List files for specific read IDs."""
seqspec = spec.get_seqspec(modality)
files = defaultdict(list)
ids = set(read_ids)
# TODO return the files in the order of the ids given in the input
# NOTE ORDERING HERE IS IMPORANT SEE GET_INDEX_BY_FILES FUNCTION
for read in seqspec:
- if read.read_id in ids:
- for file in read.files:
- # files[read.read_id].append(file.filename)
- files[read.read_id].append(file)
+ if read.read_id in ids and read.files:
+ files[read.read_id].extend(read.files)
return files
-def list_files_by_file_id(spec, modality, file_ids):
+def list_files_by_file_id(
+ spec: Assay, modality: str, file_ids: List[str]
+) -> Dict[str, List[File]]:
+ """List files for specific file IDs."""
seqspec = spec.get_seqspec(modality)
ids = set(file_ids)
files = defaultdict(list)
# TODO: NOTE ORDERING HERE IS IMPORTANT SEE RUN_LIST_FILES FUNCTION
for read in seqspec:
- for file in read.files:
- if file.filename in ids:
- # files[read.read_id].append(file.filename)
- files[read.read_id].append(file)
+ if read.files:
+ for file in read.files:
+ if file.filename in ids:
+ files[read.read_id].append(file)
return files
-def list_files_by_region_id(spec, modality, file_ids):
+def list_files_by_region_id(
+ spec: Assay, modality: str, region_ids: List[str]
+) -> Dict[str, List[File]]:
+ """List files for specific region IDs."""
files = list_region_files(spec, modality)
-
- ids = set(file_ids)
+ ids = set(region_ids)
new_files = defaultdict(list)
- for region_id, files in files.items():
+ for region_id, region_files in files.items():
if region_id in ids:
- new_files[region_id] += files
+ new_files[region_id].extend(region_files)
return new_files
-def list_files_by_region_type(spec, modality, file_ids):
+def list_files_by_region_type(
+ spec: Assay, modality: str, region_types: List[str]
+) -> Dict[str, List[File]]:
+ """List files for specific region types."""
files = list_region_files(spec, modality)
-
- ids = set(file_ids)
+ ids = set(region_types)
new_files = defaultdict(list)
- for region_id, files in files.items():
+ for region_id, region_files in files.items():
r = seqspec_find.find_by_region_id(spec, modality, region_id)[0]
- rt = r.region_type
- if rt in ids:
- new_files[region_id] += files
+ if r.region_type in ids:
+ new_files[region_id].extend(region_files)
return new_files
diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py
index f5aec054..d1cd9f5d 100644
--- a/seqspec/seqspec_find.py
+++ b/seqspec/seqspec_find.py
@@ -1,13 +1,24 @@
+"""Find module for seqspec CLI.
+
+This module provides functionality to search for objects within seqspec files.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS
+import warnings
+import yaml
+from typing import List, Optional, Union
+
from seqspec.utils import load_spec
from seqspec.Assay import Assay
-import yaml
-import argparse
-import warnings
-from argparse import RawTextHelpFormatter
-from seqspec.seqspec_file import list_files
+from seqspec.Read import Read
+from seqspec.Region import Region
+from seqspec.File import File
+from seqspec.seqspec_file import list_all_files
-def setup_find_args(parser):
+def setup_find_args(parser) -> ArgumentParser:
+ """Create and configure the find command subparser."""
subparser = parser.add_parser(
"find",
description="""
@@ -24,45 +35,48 @@ def setup_find_args(parser):
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
# depracate
- subparser.add_argument("--rtype", help=argparse.SUPPRESS, action="store_true")
+ subparser.add_argument("--rtype", help=SUPPRESS, action="store_true")
choices = ["read", "region", "file", "region-type"]
subparser.add_argument(
"-s",
- metavar="Selector",
- help=(f"Selector, [{','.join(choices)}] (default: region)"),
+ "--selector",
+ metavar="SELECTOR",
+ help=f"Selector, [{','.join(choices)}] (default: region)",
type=str,
default="region",
choices=choices,
)
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality"),
+ help="Modality",
type=str,
- default=None,
required=True,
)
# depracate -r
subparser_required.add_argument(
"-r",
metavar="REGION",
- help=argparse.SUPPRESS,
+ help=SUPPRESS,
type=str,
default=None,
)
subparser_required.add_argument(
"-i",
- metavar="IDs",
- help=("IDs"),
+ "--id",
+ metavar="ID",
+ help="ID to search for",
type=str,
default=None,
required=False,
@@ -71,8 +85,14 @@ def setup_find_args(parser):
return subparser
-def validate_find_args(parser, args):
- # IDs
+def validate_find_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the find command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
+
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
if args.r is not None:
warnings.warn(
"The '-r' argument is deprecated and will be removed in a future version. "
@@ -80,46 +100,73 @@ def validate_find_args(parser, args):
DeprecationWarning,
)
# Optionally map the old option to the new one
- if not args.i:
- args.i = args.r
-
- fn = args.yaml
- m = args.m
- o = args.o
- idtype = args.s # selector
- ids = args.i
-
- # run function
- return run_find(fn, m, ids, idtype, o)
-
-
-def run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str):
- spec = load_spec(spec_fn)
- found = []
- if idtype == "region-type":
- found = find_by_region_type(spec, modality, id)
- elif idtype == "region":
- found = find_by_region_id(spec, modality, id)
- elif idtype == "read":
- found = find_by_read_id(spec, modality, id)
- elif idtype == "file":
- found = find_by_file_id(spec, modality, id)
- else:
- raise ValueError(f"Unknown idtype: {idtype}")
+ if not args.id:
+ args.id = args.r
+
+
+def seqspec_find(
+ spec: Assay, selector: str, modality: str, id: Optional[str] = None
+) -> Union[List[Read], List[Region], List[File]]:
+ """Core functionality to find objects in a seqspec file.
+
+ Args:
+ spec: The Assay object to search in
+ selector: Type of object to search for (read, region, file, region-type)
+ modality: The modality to search in
+ id: The ID to search for (optional)
+
+ Returns:
+ List of found objects matching the search criteria:
+ - List[Read] for "read" selector
+ - List[Region] for "region" and "region-type" selectors
+ - List[File] for "file" selector
+ - Empty list for unknown selectors
+ """
+ FIND = {
+ "region-type": find_by_region_type,
+ "region": find_by_region_id,
+ "read": find_by_read_id,
+ "file": find_by_file_id,
+ }
+
+ if selector not in FIND:
+ warnings.warn(
+ f"Unknown selector '{selector}'. Valid selectors are: {', '.join(FIND.keys())}"
+ )
+ return []
+
+ return FIND[selector](spec, modality, id)
+
- # post processing
- if o:
- with open(o, "w") as f:
+def run_find(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the find command."""
+ validate_find_args(parser, args)
+
+ spec = load_spec(args.yaml)
+ found = seqspec_find(spec, args.selector, args.modality, args.id)
+
+ # Handle output
+ if args.output:
+ with open(args.output, "w") as f:
yaml.dump(found, f, sort_keys=False)
else:
print(yaml.dump(found, sort_keys=False))
- return
+def find_by_read_id(spec: Assay, modality: str, id: Optional[str]) -> List[Read]:
+ """Find reads by their ID.
+
+ Args:
+ spec: The seqspec specification.
+ modality: The modality to search in.
+ id: The read ID to search for.
-# TODO implement
-def find_by_read_id(spec: Assay, modality: str, id: str):
+ Returns:
+ A list of Read objects matching the ID.
+ """
rds = []
+ if id is None:
+ return rds
reads = spec.get_seqspec(modality)
for r in reads:
if r.read_id == id:
@@ -127,10 +174,21 @@ def find_by_read_id(spec: Assay, modality: str, id: str):
return rds
-# TODO implement
-def find_by_file_id(spec: Assay, modality: str, id: str):
+def find_by_file_id(spec: Assay, modality: str, id: Optional[str]) -> List[File]:
+ """Find files by their ID.
+
+ Args:
+ spec: The seqspec specification.
+ modality: The modality to search in.
+ id: The file ID to search for.
+
+ Returns:
+ A list of File objects matching the ID.
+ """
files = []
- lf = list_files(spec, modality)
+ if id is None:
+ return files
+ lf = list_all_files(spec, modality)
for k, v in lf.items():
for f in v:
if f.file_id == id:
@@ -138,13 +196,37 @@ def find_by_file_id(spec: Assay, modality: str, id: str):
return files
-def find_by_region_id(spec: Assay, modality: str, id: str):
+def find_by_region_id(spec: Assay, modality: str, id: Optional[str]) -> List[Region]:
+ """Find regions by their ID.
+
+ Args:
+ spec: The seqspec specification.
+ modality: The modality to search in.
+ id: The region ID to search for.
+
+ Returns:
+ A list of Region objects matching the ID.
+ """
+ if id is None:
+ return []
m = spec.get_libspec(modality)
regions = m.get_region_by_id(id)
return regions
-def find_by_region_type(spec: Assay, modality: str, id: str):
+def find_by_region_type(spec: Assay, modality: str, id: Optional[str]) -> List[Region]:
+ """Find regions by their type.
+
+ Args:
+ spec: The seqspec specification.
+ modality: The modality to search in.
+ id: The region type to search for.
+
+ Returns:
+ A list of Region objects matching the type.
+ """
+ if id is None:
+ return []
m = spec.get_libspec(modality)
regions = m.get_region_by_region_type(id)
return regions
diff --git a/seqspec/seqspec_format.py b/seqspec/seqspec_format.py
index 87eccada..b6d209f6 100644
--- a/seqspec/seqspec_format.py
+++ b/seqspec/seqspec_format.py
@@ -1,8 +1,25 @@
+"""Format module for seqspec CLI.
+
+This module provides functionality to automatically format and fill in missing fields
+in a seqspec specification file.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+
from seqspec.utils import load_spec
-from argparse import RawTextHelpFormatter
+from seqspec.Assay import Assay
-def setup_format_args(parser):
+def setup_format_args(parser) -> ArgumentParser:
+ """Create and configure the format command subparser.
+
+ Args:
+ parser: The main argument parser to add the format subparser to.
+
+ Returns:
+ The configured format subparser.
+ """
subparser = parser.add_parser(
"format",
description="""
@@ -16,34 +33,58 @@ def setup_format_args(parser):
help="Autoformat seqspec file",
formatter_class=RawTextHelpFormatter,
)
- # subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file")
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ type=Path,
+ help="Path to output file",
default=None,
)
return subparser
-def validate_format_args(parser, args):
- fn = args.yaml
- o = args.o
+def validate_format_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the format command arguments.
+
+ Args:
+ parser: The argument parser.
+ args: The parsed arguments.
- run_format(spec_fn=fn, o=o)
+ Raises:
+ parser.error: If any validation fails.
+ """
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
-def run_format(spec_fn, o):
- spec = load_spec(spec_fn)
- format(spec)
- if o:
- spec.to_YAML(o)
+
+def run_format(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the format command.
+
+ Args:
+ parser: The argument parser.
+ args: The parsed arguments.
+ """
+ validate_format_args(parser, args)
+
+ spec = load_spec(args.yaml)
+ format_spec(spec)
+
+ if args.output:
+ spec.to_YAML(args.output)
else:
print(spec.to_YAML())
-def format(spec):
- return spec.update_spec()
+def format_spec(spec: Assay) -> None:
+ """Format a seqspec specification by updating its fields.
+
+ Args:
+ spec: The seqspec specification to format.
+ """
+ spec.update_spec()
diff --git a/seqspec/seqspec_genbank.py b/seqspec/seqspec_genbank.py
deleted file mode 100644
index 419d67ce..00000000
--- a/seqspec/seqspec_genbank.py
+++ /dev/null
@@ -1,210 +0,0 @@
-from seqspec.utils import load_genbank
-import json
-from seqspec.Region import Region
-from seqspec.Assay import Assay
-
-
-def setup_genbank_args(parser):
- subparser = parser.add_parser(
- "genbank",
- description="get genbank about seqspec file",
- help="get genbank about seqspec file",
- )
-
- subparser.add_argument("gbk", help="Genbank file")
- subparser.add_argument(
- "-o",
- metavar="OUT",
- help=("Path to output file"),
- type=str,
- default=None,
- required=False,
- )
- return subparser
-
-
-def validate_genbank_args(parser, args):
- # if everything is valid the run_genbank
- fn = args.gbk
- o = args.o
- gb = load_genbank(fn)
-
- spec = run_genbank(gb)
-
- if o:
- spec.to_YAML(o)
- else:
- print(json.dumps(spec, sort_keys=False, indent=4))
-
-
-def run_genbank(gb):
- ex = gb_to_list(gb)
- nested_json = nest_intervals(ex)
- filled_regions = fill_gaps(gb.sequence, nested_json)
- regions = convert(filled_regions)
- spec = Assay(
- "genbank",
- "illumina",
- "genbank thing",
- "doi",
- "date",
- "description",
- ["source"],
- "",
- regions,
- )
- return spec
-
-
-def gb_to_list(gb):
- feat = []
- label = "source"
- for f in gb.features:
- id = f.key
-
- if "complement" in f.location:
- start, stop = tuple(map(int, f.location[11:-1].split("..")))
- else:
- start, stop = tuple(map(int, f.location.split("..")))
-
- # convert to 0-index
- start -= 1
- length = stop - start
- seq = gb.sequence[start:stop]
-
- for q in f.qualifiers:
- if q.key == "/label=":
- label = q.value
- break
- feat.append(
- {
- "id": id,
- "label": label,
- "start": start,
- "stop": stop,
- "length": length,
- "seq": seq,
- }
- )
- return feat
-
-
-def nest_intervals(intervals):
- def nest(start_index, end_limit):
- nested = []
-
- i = start_index
- while i < len(intervals) and intervals[i]["start"] < end_limit:
- current_interval = intervals[i]
- child, next_index = nest(i + 1, current_interval["stop"])
- interval_obj = {
- "id": current_interval["id"],
- "label": current_interval["label"],
- "start": current_interval["start"],
- "stop": current_interval["stop"],
- "length": current_interval["length"],
- "seq": current_interval["seq"],
- "regions": child,
- }
- nested.append(interval_obj)
- i = next_index
-
- return nested, i
-
- result, _ = nest(0, intervals[0]["stop"])
- return result
-
-
-def fill_gaps(seq, regions, parent_start=0, parent_stop=0):
- if len(regions) == 0:
- return []
-
- # Insert a filler at the start if necessary
- if regions[0]["start"] > parent_start:
- start = parent_start
- stop = regions[0]["start"]
- s = seq[start:stop]
- regions.insert(
- 0,
- {
- "id": "filler_start",
- "label": "filler_start",
- "start": start,
- "stop": stop,
- "length": stop - start,
- "seq": s,
- "regions": [],
- },
- )
-
- new_regions = []
- for i, region in enumerate(regions):
- # Append the current region
- new_regions.append(region)
-
- # Recursive call for nested regions
- if "regions" in region:
- region["regions"] = fill_gaps(
- seq, region["regions"], region["start"], region["stop"]
- )
-
- # Check for gap and insert a filler
- if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]:
- filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}'
- start = region["stop"]
- stop = regions[i + 1]["start"]
- s = seq[start:stop]
- new_regions.append(
- {
- "id": filler_id,
- "label": filler_id,
- "start": start,
- "stop": stop,
- "length": stop - start,
- "seq": s,
- "regions": [],
- }
- )
-
- # Insert a filler at the end if necessary
- if new_regions[-1]["stop"] < parent_stop:
- start = new_regions[-1]["stop"]
- stop = parent_stop
- s = seq[start:stop]
- new_regions.append(
- {
- "id": "filler_end",
- "label": "filler_end",
- "start": start,
- "stop": stop,
- "length": stop - start,
- "seq": s,
- "regions": [],
- }
- )
-
- return new_regions
-
-
-# convert filled regions to seqspec, must be recursive function
-# regions is a list
-def convert(regions):
- if len(regions) == 0:
- return []
- new_regions = []
- for r in regions:
- rgn = Region(
- r["id"],
- "",
- r["label"],
- "fixed",
- r["seq"],
- r["length"],
- r["length"],
- None,
- None,
- )
- if len(r["regions"]) > 0:
- rgn.regions = convert(r["regions"])
- new_regions.append(rgn)
- return new_regions
diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py
index 4bbc6576..fff7f59e 100644
--- a/seqspec/seqspec_index.py
+++ b/seqspec/seqspec_index.py
@@ -1,19 +1,28 @@
+"""Index module for seqspec CLI.
+
+This module provides functionality to identify the position of elements in a spec for use in downstream tools.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS
+import warnings
+from typing import List, Optional, Dict, Any
+
from seqspec.utils import load_spec, map_read_id_to_regions
from seqspec.seqspec_find import find_by_region_id
-import warnings
from seqspec.seqspec_file import list_files_by_file_id, list_read_files
-from argparse import SUPPRESS, RawTextHelpFormatter
-from seqspec.Region import complement_sequence
-from seqspec.Region import RegionCoordinateDifference
-
from seqspec.Region import (
+ complement_sequence,
+ RegionCoordinateDifference,
project_regions_to_coordinates,
itx_read,
)
from seqspec.Read import ReadCoordinate
+from seqspec.Assay import Assay
-def setup_index_args(parser):
+def setup_index_args(parser) -> ArgumentParser:
+ """Create and configure the index command subparser."""
subparser = parser.add_parser(
"index",
description="""
@@ -30,12 +39,13 @@ def setup_index_args(parser):
formatter_class=RawTextHelpFormatter,
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
@@ -50,6 +60,7 @@ def setup_index_args(parser):
choices = [
"chromap",
"kb",
+ "kb-single",
"relative",
"seqkit",
"simpleaf",
@@ -60,8 +71,9 @@ def setup_index_args(parser):
]
subparser.add_argument(
"-t",
+ "--tool",
metavar="TOOL",
- help=(f"Tool, [{', '.join(choices)}] (default: tab)"),
+ help=f"Tool, [{', '.join(choices)}] (default: tab)",
default="tab",
type=str,
choices=choices,
@@ -70,8 +82,9 @@ def setup_index_args(parser):
choices = ["read", "region", "file"]
subparser.add_argument(
"-s",
+ "--selector",
metavar="SELECTOR",
- help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"),
+ help=f"Selector for ID, [{', '.join(choices)}] (default: read)",
type=str,
default="read",
choices=choices,
@@ -90,16 +103,17 @@ def setup_index_args(parser):
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality"),
+ help="Modality",
type=str,
- default=None,
required=True,
)
subparser_required.add_argument(
"-i",
+ "--ids",
metavar="IDs",
- help=("IDs"),
+ help="IDs",
type=str,
default=None,
required=False,
@@ -114,7 +128,8 @@ def setup_index_args(parser):
return subparser
-def validate_index_args(parser, args):
+def validate_index_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the index command arguments."""
if args.r is not None:
warnings.warn(
"The '-r' argument is deprecated and will be removed in a future version. "
@@ -122,8 +137,8 @@ def validate_index_args(parser, args):
DeprecationWarning,
)
# Optionally map the old option to the new one
- if not args.i:
- args.i = args.r
+ if not args.ids:
+ args.ids = args.r
if args.region:
warnings.warn(
"The '--region' argument is deprecated and will be removed in a future version. "
@@ -131,61 +146,67 @@ def validate_index_args(parser, args):
DeprecationWarning,
)
- fn = args.yaml
- m = args.m
- ids = args.i # this can be a list of ids (reads, regions, or files)
- t = args.t
- o = args.o
- subregion_type = args.subregion_type
- rev = args.rev
- idtype = args.s
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
- if ids is None and (idtype == "read" or idtype == "region"):
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
+ if args.ids is None and (args.selector == "read" or args.selector == "region"):
parser.error("Must specify ids with -i for -s read or -s region")
- return run_index(fn, m, ids, idtype, t, rev, subregion_type, o=o)
-
-
-def run_index(
- spec_fn,
- modality,
- ids,
- idtype,
- fmt,
- rev,
- subregion_type,
- o,
-):
- spec = load_spec(spec_fn)
- if ids is None:
- ids = []
- else:
- ids = ids.split(",")
- x = index(spec, modality, ids, idtype, fmt, rev, subregion_type)
+def seqspec_index(
+ spec: Assay,
+ modality: str,
+ ids: List[str],
+ idtype: str,
+ rev: bool = False,
+) -> List[Dict[str, Any]]:
+ """Core functionality to get index information from the spec.
+
+ Args:
+ spec: The Assay object to index
+ modality: The modality to index
+ ids: List of IDs to index
+ idtype: Type of ID (read, region, file)
+ rev: Whether to return 3'->5' region order
+
+ Returns:
+ List of index dictionaries containing region coordinates and strand information
+ """
+ GET_INDICES = {
+ "file": get_index_by_files,
+ }
- # post processing
- if o:
- with open(o, "w") as f:
- print(x, file=f)
- else:
- print(x)
+ GET_INDICES_BY_IDS = {
+ "file": get_index_by_file_ids,
+ "region": get_index_by_region_ids,
+ "read": get_index_by_read_ids,
+ }
+
+ if not ids:
+ return GET_INDICES[idtype](spec, modality)
+ return GET_INDICES_BY_IDS[idtype](spec, modality, ids)
- return
+def format_index(
+ indices: List[Dict[str, Any]], fmt: str, subregion_type: Optional[str] = None
+) -> str:
+ """Format index information into a specific output format.
-def index(
- spec,
- modality,
- ids,
- idtype,
- fmt,
- rev=False,
- subregion_type=None,
-):
+ Args:
+ indices: List of index dictionaries from seqspec_index
+ fmt: Output format to use
+ subregion_type: Optional subregion type for filtering
+
+ Returns:
+ Formatted index information as a string
+ """
FORMAT = {
"chromap": format_chromap,
"kb": format_kallisto_bus,
+ "kb-single": format_kallisto_bus_force_single,
"relative": format_relative,
"seqkit": format_seqkit_subseq,
"simpleaf": format_simpleaf,
@@ -195,22 +216,37 @@ def index(
"zumis": format_zumis,
}
- GET_INDICES = {
- "file": get_index_by_files,
- }
+ if fmt not in FORMAT:
+ warnings.warn(
+ f"Unknown format '{fmt}'. Valid formats are: {', '.join(FORMAT.keys())}"
+ )
+ return ""
- GET_INDICES_BY_IDS = {
- "file": get_index_by_file_ids,
- "region": get_index_by_region_ids,
- "read": get_index_by_read_ids,
- }
+ return FORMAT[fmt](indices, subregion_type)
- if len(ids) == 0:
- indices = GET_INDICES[idtype](spec, modality)
- else:
- indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids)
- return FORMAT[fmt](indices, subregion_type)
+def run_index(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the index command."""
+ validate_index_args(parser, args)
+
+ spec = load_spec(args.yaml)
+ ids = args.ids.split(",") if args.ids else []
+
+ indices = seqspec_index(
+ spec,
+ args.modality,
+ ids,
+ args.selector,
+ args.rev,
+ )
+
+ result = format_index(indices, args.tool, args.subregion_type)
+
+ if args.output:
+ with open(args.output, "w") as f:
+ print(result, file=f)
+ else:
+ print(result)
def get_index_by_files(spec, modality):
@@ -283,6 +319,9 @@ def get_index_by_primer(
return {read_id: new_rcs, "strand": rdc.read.strand}
+FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"}
+
+
def format_kallisto_bus(indices, subregion_type=None):
bcs = []
umi = []
@@ -295,12 +334,7 @@ def format_kallisto_bus(indices, subregion_type=None):
bcs.append(f"{idx},{cut.start},{cut.stop}")
elif cut.region_type.upper() == "UMI":
umi.append(f"{idx},{cut.start},{cut.stop}")
- elif (
- cut.region_type.upper() == "CDNA"
- or cut.region_type.upper() == "GDNA"
- or cut.region_type.upper() == "PROTEIN"
- or cut.region_type.upper() == "TAG"
- ):
+ elif cut.region_type.upper() in FEATURE_REGION_TYPES:
feature.append(f"{idx},{cut.start},{cut.stop}")
if len(umi) == 0:
umi.append("-1,-1,-1")
@@ -311,6 +345,38 @@ def format_kallisto_bus(indices, subregion_type=None):
return x
+def format_kallisto_bus_force_single(indices, subregion_type=None):
+ bcs = []
+ umi = []
+ feature = []
+ longest_feature = None
+ max_length = 0
+
+ for idx, region in enumerate(indices):
+ rg_strand = region.pop("strand") # noqa
+ for rgn, cuts in region.items():
+ for cut in cuts:
+ if cut.region_type.upper() == "BARCODE":
+ bcs.append(f"{idx},{cut.start},{cut.stop}")
+ elif cut.region_type.upper() == "UMI":
+ umi.append(f"{idx},{cut.start},{cut.stop}")
+ elif cut.region_type.upper() in FEATURE_REGION_TYPES:
+ length = cut.stop - cut.start
+ if length > max_length:
+ max_length = length
+ longest_feature = f"{idx},{cut.start},{cut.stop}"
+
+ if len(umi) == 0:
+ umi.append("-1,-1,-1")
+ if len(bcs) == 0:
+ bcs.append("-1,-1,-1")
+ if longest_feature:
+ feature.append(longest_feature)
+
+ x = ",".join(bcs) + ":" + ",".join(umi) + ":" + ",".join(feature)
+ return x
+
+
# this one should only return one string
# TODO: return to this
def format_seqkit_subseq(indices, subregion_type=None):
diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py
index c01a9639..9a257311 100644
--- a/seqspec/seqspec_info.py
+++ b/seqspec/seqspec_info.py
@@ -1,13 +1,13 @@
from seqspec.utils import load_spec
import json
-from typing import List
-from seqspec.Region import Region
-from seqspec.Read import Read
+from typing import Dict
from seqspec.Assay import Assay
-from argparse import RawTextHelpFormatter
+from argparse import RawTextHelpFormatter, ArgumentParser, Namespace
+from pathlib import Path
-def setup_info_args(parser):
+def setup_info_args(parser) -> ArgumentParser:
+ """Create and configure the info command subparser."""
subparser = parser.add_parser(
"info",
description="""
@@ -24,12 +24,13 @@ def setup_info_args(parser):
formatter_class=RawTextHelpFormatter,
)
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
choices = ["modalities", "meta", "sequence_spec", "library_spec"]
subparser.add_argument(
"-k",
+ "--key",
metavar="KEY",
- help=(f"Object to display, [{', '.join(choices)}] (default: meta)"),
+ help=f"Object to display, [{', '.join(choices)}] (default: meta)",
type=str,
default="meta",
required=False,
@@ -37,8 +38,9 @@ def setup_info_args(parser):
choices = ["tab", "json"]
subparser.add_argument(
"-f",
+ "--format",
metavar="FORMAT",
- help=(f"The output format, [{', '.join(choices)}] (default: tab)"),
+ help=f"The output format, [{', '.join(choices)}] (default: tab)",
type=str,
default="tab",
required=False,
@@ -46,116 +48,239 @@ def setup_info_args(parser):
)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
required=False,
)
return subparser
-def validate_info_args(parser, args):
- spec_fn = args.yaml
- o = args.o
- k = args.k
- fmt = args.f
- return run_info(spec_fn, fmt, k, o)
+def validate_info_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the info command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
+
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
+
+def run_info(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the info command."""
+ validate_info_args(parser, args)
+
+ spec = load_spec(args.yaml)
+
+ if args.key:
+ # Extract data
+ info = seqspec_info(spec, args.key)
+ # Format info
+ result = format_info(info, args.key, args.format)
+ if args.output:
+ with open(args.output, "w") as f:
+ if args.format == "json":
+ f.write(result)
+ else:
+ print(result, file=f)
+ else:
+ print(result)
-def run_info(spec_fn, f, k=None, o=None):
- # return json of the Assay object
- spec = load_spec(spec_fn)
- CMD = {
+
+def seqspec_info(spec: Assay, key: str) -> Dict:
+ """Get information from the spec based on the key.
+
+ Args:
+ spec: The Assay object to get info from
+ key: The type of information to retrieve (modalities, meta, sequence_spec, library_spec)
+
+ Returns:
+ Dictionary containing the requested information
+
+ Raises:
+ KeyError: If the requested key is not supported
+ """
+ INFO_FUNCS = {
"modalities": seqspec_info_modalities,
- "meta": seqspec_info,
+ "meta": seqspec_info_meta,
"sequence_spec": seqspec_info_sequence_spec,
"library_spec": seqspec_info_library_spec,
}
- s = ""
- if k:
- s = CMD[k](spec, f)
+ if key not in INFO_FUNCS:
+ raise KeyError(
+ f"Unsupported info key: {key}. Must be one of {list(INFO_FUNCS.keys())}"
+ )
+ return INFO_FUNCS[key](spec)
- if o:
- with open(o, "w") as f:
- json.dump(s, f, sort_keys=False, indent=4)
- else:
- print(s)
- return
+def format_info(info: Dict, key: str, fmt: str = "tab") -> str:
+ """Format information based on the key and format.
-def seqspec_info(spec, fmt):
- s = format_info(spec, fmt)
- return s
+ Args:
+ info: Dictionary containing the information to format
+ key: The type of information to format (modalities, meta, sequence_spec, library_spec)
+ fmt: Output format (tab or json)
+ Returns:
+ Formatted string
-def seqspec_info_library_spec(spec, fmt):
+ Raises:
+ KeyError: If the requested key is not supported
+ """
+ FORMAT_FUNCS = {
+ "modalities": format_modalities,
+ "meta": format_meta,
+ "sequence_spec": format_sequence_spec,
+ "library_spec": format_library_spec,
+ }
+ if key not in FORMAT_FUNCS:
+ raise KeyError(
+ f"Unsupported format key: {key}. Must be one of {list(FORMAT_FUNCS.keys())}"
+ )
+ return FORMAT_FUNCS[key](info, fmt)
+
+
+def seqspec_info_meta(spec: Assay) -> Dict:
+ """Get meta information about the spec.
+
+ Args:
+ spec: The Assay object to get info from
+
+ Returns:
+ Dictionary containing meta information
+ """
+ sd = spec.to_dict()
+ del sd["library_spec"]
+ del sd["sequence_spec"]
+ del sd["modalities"]
+ return {"meta": sd}
+
+
+def seqspec_info_library_spec(spec: Assay) -> Dict:
+ """Get library specification information.
+
+ Args:
+ spec: The Assay object to get info from
+
+ Returns:
+ Dictionary containing library specifications by modality
+ """
modalities = spec.list_modalities()
- s = ""
+ result = {}
for m in modalities:
libspec = spec.get_libspec(m)
- s += format_library_spec(m, libspec.get_leaves(), fmt)
- return s
+ result[m] = libspec.get_leaves()
+ return {"library_spec": result}
-def seqspec_info_sequence_spec(spec: Assay, fmt):
- reads = format_sequence_spec(spec.sequence_spec, fmt)
- return reads
+def seqspec_info_sequence_spec(spec: Assay) -> Dict:
+ """Get sequence specification information.
+ Args:
+ spec: The Assay object to get info from
-def seqspec_info_modalities(spec, fmt):
- modalities = format_modalities(spec.list_modalities(), fmt)
- return modalities
+ Returns:
+ Dictionary containing sequence specifications
+ """
+ return {"sequence_spec": spec.sequence_spec}
-def format_info(spec: Assay, fmt="tab"):
- sd = spec.to_dict()
- del sd["library_spec"]
- del sd["sequence_spec"]
- del sd["modalities"]
- s = ""
+def seqspec_info_modalities(spec: Assay) -> Dict:
+ """Get list of modalities.
+
+ Args:
+ spec: The Assay object to get info from
+
+ Returns:
+ Dictionary containing list of modalities
+ """
+ return {"modalities": spec.list_modalities()}
+
+
+def format_meta(info: Dict, fmt: str = "tab") -> str:
+ """Format meta information.
+
+ Args:
+ info: Dictionary containing meta information from seqspec_info_meta
+ fmt: Output format (tab or json)
+
+ Returns:
+ Formatted string
+ """
if fmt == "tab":
- for k, v in sd.items():
- s += f"{v}\t"
- s = s[:-1]
+ return "\t".join(str(v) for v in info["meta"].values())
elif fmt == "json":
- s = json.dumps(sd, sort_keys=False, indent=4)
+ return json.dumps(info["meta"], sort_keys=False, indent=4)
+ return ""
+
- return s
+def format_modalities(info: Dict, fmt: str = "tab") -> str:
+ """Format list of modalities.
+ Args:
+ info: Dictionary containing modalities from seqspec_info_modalities
+ fmt: Output format (tab or json)
-def format_modalities(modalities: List[str], fmt="tab"):
- s = ""
+ Returns:
+ Formatted string
+ """
if fmt == "tab":
- s = "\t".join(modalities)
+ return "\t".join(info["modalities"])
elif fmt == "json":
- s = json.dumps(modalities, sort_keys=False, indent=4)
- return s
+ return json.dumps(info["modalities"], sort_keys=False, indent=4)
+ return ""
-def format_sequence_spec(sequence_spec: List[Read], fmt="tab"):
- s = ""
+def format_sequence_spec(info: Dict, fmt: str = "tab") -> str:
+ """Format sequence specification.
+
+ Args:
+ info: Dictionary containing sequence specs from seqspec_info_sequence_spec
+ fmt: Output format (tab or json)
+
+ Returns:
+ Formatted string
+ """
if fmt == "tab":
- # format the output as a table
- for r in sequence_spec:
- s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{','.join([i.file_id for i in r.files])}\n"
- s = s[:-1]
+ lines = []
+ for r in info["sequence_spec"]:
+ files = ",".join([i.file_id for i in r.files]) if r.files else ""
+ lines.append(
+ f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}"
+ )
+ return "\n".join(lines)
elif fmt == "json":
- s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4)
- return s
+ return json.dumps(
+ [i.to_dict() for i in info["sequence_spec"]], sort_keys=False, indent=4
+ )
+ return ""
+
+
+def format_library_spec(info: Dict, fmt: str = "tab") -> str:
+ """Format library specification.
+ Args:
+ info: Dictionary containing library specs from seqspec_info_library_spec
+ fmt: Output format (tab or json)
-def format_library_spec(modality: str, library_spec: List[Region], fmt="tab"):
- s = ""
+ Returns:
+ Formatted string
+ """
if fmt == "tab":
- for r in library_spec:
- file = None
- if r.onlist:
- file = r.onlist.filename
- s += f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}\n"
- s = s[:-1]
+ lines = []
+ for modality, regions in info["library_spec"].items():
+ for r in regions:
+ file = r.onlist.filename if r.onlist else None
+ lines.append(
+ f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}"
+ )
+ return "\n".join(lines)
elif fmt == "json":
- s = json.dumps(
- {modality: [i.to_dict() for i in library_spec]}, sort_keys=False, indent=4
+ return json.dumps(
+ {m: [i.to_dict() for i in r] for m, r in info["library_spec"].items()},
+ sort_keys=False,
+ indent=4,
)
- return s
+ return ""
diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py
index 066c5b65..a65af025 100644
--- a/seqspec/seqspec_init.py
+++ b/seqspec/seqspec_init.py
@@ -1,18 +1,25 @@
+"""Init module for seqspec CLI.
+
+This module provides functionality to generate new seqspec files from a newick tree format.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+from typing import List
+
+import newick
from seqspec.Assay import Assay
from seqspec.Region import Region
from seqspec.File import File
from seqspec.Read import Read
-from typing import List
-import newick
-from argparse import RawTextHelpFormatter
# example
+# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" # seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" " ((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)"
+# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)"
-# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)"
-# seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" "((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)"
-# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)"
-def setup_init_args(parser):
+def setup_init_args(parser) -> ArgumentParser:
+ """Create and configure the init command subparser."""
subparser = parser.add_parser(
"init",
description="""
@@ -27,85 +34,90 @@ def setup_init_args(parser):
)
subparser_required = subparser.add_argument_group("required arguments")
subparser_required.add_argument(
- "-n", metavar="NAME", type=str, help="assay name", required=True
+ "-n", "--name", metavar="NAME", type=str, help="Assay name", required=True
)
- # -m "rna,atac"
subparser_required.add_argument(
"-m",
+ "--modalities",
metavar="MODALITIES",
type=str,
- help="list of comma-separated modalities (e.g. rna,atac)",
+ help="List of comma-separated modalities (e.g. rna,atac)",
required=True,
)
# -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg"
subparser_required.add_argument(
"-r",
+ "--reads",
metavar="READS",
type=str,
- help="list of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)",
+ help="List of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)",
required=True,
)
-
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
subparser.add_argument(
"newick",
- help=(
- "tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)"
- ),
+ help="Tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)",
)
return subparser
-def validate_init_args(parser, args):
- name = args.n
- modalities_str = args.m
- newick_str = args.newick
- o = args.o
- reads_str = args.r
+def validate_init_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the init command arguments."""
+ if not args.newick:
+ parser.error("Newick tree must be provided")
- if newick_str is None:
- parser.error("modality-FASTQs pairs must be provided")
+ if args.output and args.output.exists() and not args.output.is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
- return run_init(name, modalities_str, newick_str, reads_str, o)
+def run_init(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the init command."""
+ validate_init_args(parser, args)
-def run_init(name: str, modalities_str, newick_str, reads_str, o=None):
- modalities = modalities_str.split(",")
- reads = parse_reads_string(reads_str)
- tree = newick.loads(newick_str)
- if len(tree[0].descendants) != len(modalities):
- raise ValueError(
- "Number of modalities must match number of modality-FASTQs pairs"
- )
+ modalities = args.modalities.split(",")
+ reads = parse_reads_string(args.reads)
+ regions = newick_to_regions(args.newick)
- reads = parse_reads_string(reads_str)
- spec = init(name, modalities, tree[0].descendants, reads)
+ spec = seqspec_init(args.name, modalities, regions, reads)
+ yaml_str = spec.to_YAML()
+ if yaml_str is None:
+ raise ValueError("Failed to generate YAML string from assay")
- if o:
- spec.to_YAML(o)
+ if args.output:
+ args.output.write_text(yaml_str)
else:
- print(spec.to_YAML())
+ print(yaml_str)
+
- return
+def seqspec_init(
+ name: str, modalities: List[str], regions: List[Region], reads: List[Read]
+) -> Assay:
+ """Initialize a new seqspec specification.
+ Args:
+ name: Name of the assay
+ modalities: List of modalities
+ regions: List of Region objects
+ reads: List of read specifications
-def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]):
- # make read for each fastq
- # make region for each modality
- # add modality regions to assay
- rgns = []
- for t in tree:
- r = Region(region_id="", region_type="", name="", sequence_type="")
- rgns.append(newick_to_region(t, r))
+ Returns:
+ Initialized Assay object
- assay = Assay(
+ Raises:
+ ValueError: If number of modalities doesn't match number of regions
+ """
+ if len(regions) != len(modalities):
+ raise ValueError("Number of modalities must match number of regions")
+
+ return Assay(
assay_id="",
name=name,
doi="",
@@ -118,59 +130,103 @@ def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]):
sequence_kit="",
sequence_protocol="",
sequence_spec=reads,
- library_spec=rgns,
+ library_spec=regions,
)
- return assay
-def newick_to_region(
- node, region=Region(region_id="", region_type="", name="", sequence_type="")
-):
+def newick_to_regions(newick_str: str) -> List[Region]:
+ """Convert a newick string to a list of Region objects.
+
+ Args:
+ newick_str: Newick format string representing the library structure
+
+ Returns:
+ List of Region objects
+
+ Raises:
+ ValueError: If newick string is invalid
+ """
+ try:
+ tree = newick.loads(newick_str)
+ except Exception as e:
+ raise ValueError(f"Invalid newick string: {e}")
+
+ regions = []
+ for node in tree[0].descendants:
+ region = Region(region_id="", region_type="", name="", sequence_type="")
+ regions.append(newick_to_region(node, region))
+ return regions
+
+
+def newick_to_region(node: newick.Node, region: Region) -> Region:
+ """Convert a newick node to a Region object.
+
+ Args:
+ node: Newick tree node
+ region: Base region object to populate
+
+ Returns:
+ Populated Region object
+ """
region.region_id = node.name
region.name = node.name
- if len(node.descendants) == 0:
+ if not node.descendants:
region.min_len = int(node.length)
region.max_len = int(node.length)
return region
+
region.regions = []
- for n in node.descendants:
+ for descendant in node.descendants:
region.regions.append(
newick_to_region(
- n,
- Region(region_id=n.name, region_type="", name=n.name, sequence_type=""),
+ descendant,
+ Region(
+ region_id=descendant.name,
+ region_type="",
+ name=descendant.name,
+ sequence_type="",
+ ),
)
)
return region
-def parse_reads_string(input_string):
+def parse_reads_string(input_string: str) -> List[Read]:
+ """Parse a string of read specifications into Read objects.
+
+ Args:
+ input_string: String containing read specifications in format
+ "modality,read_id,primer_id,min_len,strand:..."
+
+ Returns:
+ List of Read objects
+ """
reads = []
- objects = input_string.split(":")
- for obj in objects:
- parts = obj.split(",")
- modality, read_id, primer_id, min_len, strand = parts
-
- read = Read(
- read_id=read_id,
- name=read_id,
- modality=modality,
- primer_id=primer_id,
- min_len=int(min_len),
- max_len=int(min_len),
- strand=strand,
- files=[
- File(
- file_id=read_id,
- filename=read_id,
- filetype="",
- filesize=0,
- url="",
- urltype="",
- md5="",
- )
- ],
+ for obj in input_string.split(":"):
+ modality, read_id, primer_id, min_len, strand = obj.split(",")
+
+ reads.append(
+ Read(
+ read_id=read_id,
+ name=read_id,
+ modality=modality,
+ primer_id=primer_id,
+ min_len=int(min_len),
+ max_len=int(min_len),
+ strand=strand,
+ files=[
+ File(
+ file_id=read_id,
+ filename=read_id,
+ filetype="",
+ filesize=0,
+ url="",
+ urltype="",
+ md5="",
+ )
+ ],
+ )
)
- reads.append(read)
return reads
diff --git a/seqspec/seqspec_methods.py b/seqspec/seqspec_methods.py
index 2d19915c..6f7a3b01 100644
--- a/seqspec/seqspec_methods.py
+++ b/seqspec/seqspec_methods.py
@@ -1,17 +1,27 @@
+"""Methods module for seqspec.
+
+This module provides functionality to convert seqspec files into methods sections.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
+
from seqspec.utils import load_spec
from seqspec.Assay import Assay
from seqspec.Region import Region
-from argparse import RawTextHelpFormatter
+from seqspec.Read import Read, File
-def setup_methods_args(parser):
+def setup_methods_args(parser) -> ArgumentParser:
+ """Create and configure the methods command subparser."""
subparser = parser.add_parser(
"methods",
description="""
Convert seqspec file into methods section.
Examples:
-seqspec methods -m rna spec.yaml # Return methods section for rna modality
+seqspec methods -m rna -o methods.txt spec.yaml # Save methods section to file
+seqspec methods -m rna spec.yaml # Print methods section to stdout
---
""",
help="Convert seqspec file into methods section",
@@ -19,46 +29,50 @@ def setup_methods_args(parser):
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str)
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality"),
+ help="Modality",
type=str,
- default=None,
required=True,
)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
return subparser
-def validate_methods_args(parser, args):
- # if everything is valid the run_methods
- fn = args.yaml
- o = args.o
+def validate_methods_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the methods command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
- m = args.m
- return run_methods(fn, m, o)
+ if args.output and args.output.exists() and not args.output.is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
-def run_methods(spec_fn, m, o):
- spec = load_spec(spec_fn)
- m = methods(spec, m)
+def run_methods(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the methods command."""
+ validate_methods_args(parser, args)
- if o:
- with open(o, "w") as f:
- print(m, file=f)
+ spec = load_spec(args.yaml)
+ methods_text = methods(spec, args.modality)
+
+ if args.output:
+ args.output.write_text(methods_text)
else:
- print(m)
+ print(methods_text)
-def methods(spec, modality):
+def methods(spec: Assay, modality: str) -> str:
+ """Generate methods section for spec and modality."""
m = f"""Methods
The {modality} portion of the {spec.name} assay was generated on {spec.date}.
"""
@@ -67,15 +81,16 @@ def methods(spec, modality):
# TODO: manage sequence/library protocol/kit for cases where each modality has different protocols/kits
-def format_library_spec(spec: Assay, m):
- leaves = spec.get_libspec(m).get_leaves()
+def format_library_spec(spec: Assay, modality: str) -> str:
+ """Format library specification for methods section."""
+ leaves = spec.get_libspec(modality).get_leaves()
lib_prot = None
if isinstance(spec.library_protocol, str):
lib_prot = spec.library_protocol
elif isinstance(spec.library_protocol, list):
for i in spec.library_protocol:
- if i.modality == m:
+ if i.modality == modality:
lib_prot = i.protocol_id
lib_kit = None
@@ -83,7 +98,7 @@ def format_library_spec(spec: Assay, m):
lib_kit = spec.library_kit
elif isinstance(spec.library_kit, list):
for i in spec.library_kit:
- if i.modality == m:
+ if i.modality == modality:
lib_kit = i.kit_id
seq_prot = None
@@ -91,7 +106,7 @@ def format_library_spec(spec: Assay, m):
seq_prot = spec.sequence_protocol
elif isinstance(spec.sequence_protocol, list):
for i in spec.sequence_protocol:
- if i.modality == m:
+ if i.modality == modality:
seq_prot = i.protocol_id
seq_kit = None
@@ -99,7 +114,7 @@ def format_library_spec(spec: Assay, m):
seq_kit = spec.sequence_kit
elif isinstance(spec.sequence_kit, list):
for i in spec.sequence_kit:
- if i.modality == m:
+ if i.modality == modality:
seq_kit = i.kit_id
s = f"""
@@ -112,13 +127,14 @@ def format_library_spec(spec: Assay, m):
\nSequence structure\n
The library was sequenced on a {seq_prot} using the {seq_kit} sequencing kit. The library was sequenced using the following configuration:\n
"""
- reads = spec.get_seqspec(m)
+ reads = spec.get_seqspec(modality)
for idx, r in enumerate(reads, 1):
s += format_read(r, idx)
return s
-def format_region(region: Region, idx: int = 1):
+def format_region(region: Region, idx: int = 1) -> str:
+ """Format region for methods section."""
s = f"{idx}. {region.name}: {region.min_len}-{region.max_len}bp {region.sequence_type} sequence ({region.sequence})"
if region.onlist:
s += f", onlist file: {region.onlist.filename}.\n"
@@ -127,14 +143,15 @@ def format_region(region: Region, idx: int = 1):
return s
-def format_read(read, idx: int = 1):
+def format_read(read: Read, idx: int = 1) -> str:
+ """Format read for methods section."""
s = f"- {read.name}: {read.max_len} cycles on the {'positive' if read.strand == 'pos' else 'negative'} strand using the {read.primer_id} primer. The following files contain the sequences in Read {idx}:\n"
- for idx, f in enumerate(read.files, 1):
- s += " " + format_read_file(f, idx)
- s = s[:-1]
+ if read.files:
+ for idx, f in enumerate(read.files, 1):
+ s += " " + format_read_file(f, idx)
return s
-def format_read_file(file, idx: int = 1):
- s = f"- File {idx}: {file.filename}\n"
- return s
+def format_read_file(file: File, idx: int = 1) -> str:
+ """Format read file for methods section."""
+ return f"- File {idx}: {file.filename}\n"
diff --git a/seqspec/seqspec_modify.py b/seqspec/seqspec_modify.py
index 006cf005..78aa1ecd 100644
--- a/seqspec/seqspec_modify.py
+++ b/seqspec/seqspec_modify.py
@@ -1,12 +1,20 @@
+"""Modify module for seqspec.
+
+This module provides functionality to modify attributes of various elements in seqspec files.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS
+from typing import List, Optional
+import warnings
+
from seqspec.utils import load_spec
from seqspec.File import File
-from argparse import RawTextHelpFormatter, SUPPRESS
-import warnings
+from seqspec.Assay import Assay
-# TODO fix modify to use the -s selector
-def setup_modify_args(parser):
- # given a spec, a region id and a list of key value property pairs, modify the spec
+def setup_modify_args(parser) -> ArgumentParser:
+ """Create and configure the modify command subparser."""
subparser = parser.add_parser(
"modify",
description="""
@@ -22,34 +30,34 @@ def setup_modify_args(parser):
formatter_class=RawTextHelpFormatter,
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str)
# Read properties
subparser.add_argument(
"--read-id",
metavar="READID",
- help=("New ID of read"),
+ help="New ID of read",
type=str,
default=None,
)
subparser.add_argument(
"--read-name",
metavar="READNAME",
- help=("New name of read"),
+ help="New name of read",
type=str,
default=None,
)
subparser.add_argument(
"--primer-id",
metavar="PRIMERID",
- help=("New ID of primer"),
+ help="New ID of primer",
type=str,
default=None,
)
subparser.add_argument(
"--strand",
metavar="STRAND",
- help=("New strand"),
+ help="New strand",
type=str,
default=None,
)
@@ -59,45 +67,44 @@ def setup_modify_args(parser):
subparser.add_argument(
"--files",
metavar="FILES",
- help=("New files, (filename,filetype,filesize,url,urltype,md5:...)"),
+ help="New files, (filename,filetype,filesize,url,urltype,md5:...)",
type=str,
default=None,
)
# Region properties
-
subparser.add_argument(
"--region-id",
metavar="REGIONID",
- help=("New ID of region"),
+ help="New ID of region",
type=str,
default=None,
)
subparser.add_argument(
"--region-type",
metavar="REGIONTYPE",
- help=("New type of region"),
+ help="New type of region",
type=str,
default=None,
)
subparser.add_argument(
"--region-name",
metavar="REGIONNAME",
- help=("New name of region"),
+ help="New name of region",
type=str,
default=None,
)
subparser.add_argument(
"--sequence-type",
metavar="SEQUENCETYPE",
- help=("New type of sequence"),
+ help="New type of sequence",
type=str,
default=None,
)
subparser.add_argument(
"--sequence",
metavar="SEQUENCE",
- help=("New sequence"),
+ help="New sequence",
type=str,
default=None,
)
@@ -106,25 +113,25 @@ def setup_modify_args(parser):
subparser.add_argument(
"--min-len",
metavar="MINLEN",
- help=("Min region length"),
+ help="Min region length",
type=int,
default=None,
)
subparser.add_argument(
"--max-len",
metavar="MAXLEN",
- help=("Max region length"),
+ help="Max region length",
type=int,
default=None,
)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
- required=False,
)
subparser_required.add_argument(
"-r",
@@ -132,38 +139,38 @@ def setup_modify_args(parser):
help=SUPPRESS,
type=str,
default=None,
- required=False,
)
subparser_required.add_argument(
"-i",
metavar="IDs",
- help=("IDs"),
+ help="IDs",
type=str,
default=None,
- required=False,
)
choices = ["read", "region"]
subparser.add_argument(
"-s",
+ "--selector",
metavar="SELECTOR",
- help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"),
+ help=f"Selector for ID, [{', '.join(choices)}] (default: read)",
type=str,
default="read",
choices=choices,
)
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality of the assay"),
+ help="Modality of the assay",
type=str,
- default=None,
required=True,
)
return subparser
-def validate_modify_args(parser, args):
+def validate_modify_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the modify command arguments."""
if args.r is not None:
warnings.warn(
"The '-r' argument is deprecated and will be removed in a future version. "
@@ -174,103 +181,99 @@ def validate_modify_args(parser, args):
if not args.i:
args.i = args.r
- # if everything is valid the run_format
- fn = args.yaml
- o = args.o
- modality = args.m
- # target_r = args.r
- idtype = args.s # selector
- ids = args.i
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
- # Read properties
- read_id = args.read_id
- read_name = args.read_name
- primer_id = args.primer_id
- strand = args.strand
- files = args.files
+ if args.output and args.output.exists() and not args.output.is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
- # Region properties
- region_id = args.region_id
- region_type = args.region_type
- region_name = args.region_name
- sequence_type = args.sequence_type
- sequence = args.sequence
- # Read and Region properties
- min_len = args.min_len
- max_len = args.max_len
+def run_modify(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the modify command."""
+ validate_modify_args(parser, args)
- spec = load_spec(fn)
+ spec = load_spec(args.yaml)
+ # Read properties
read_kwd = {
- "read_id": read_id,
- "read_name": read_name,
- "primer_id": primer_id,
- "min_len": min_len,
- "max_len": max_len,
- "strand": strand,
- "files": files,
+ "read_id": args.read_id,
+ "read_name": args.read_name,
+ "primer_id": args.primer_id,
+ "min_len": args.min_len,
+ "max_len": args.max_len,
+ "strand": args.strand,
+ "files": args.files,
}
+ # Region properties
region_kwd = {
- "region_id": region_id,
- "region_type": region_type,
- "name": region_name,
- "sequence_type": sequence_type,
- "sequence": sequence,
- "min_len": min_len,
- "max_len": max_len,
+ "region_id": args.region_id,
+ "region_type": args.region_type,
+ "name": args.region_name,
+ "sequence_type": args.sequence_type,
+ "sequence": args.sequence,
+ "min_len": args.min_len,
+ "max_len": args.max_len,
}
- if idtype == "region":
- spec = run_modify_region(spec, modality, ids, **region_kwd)
- elif idtype == "read":
- spec = run_modify_read(spec, modality, ids, **read_kwd)
- # update region in spec
- # once the region is updated, update the spec
+ if args.selector == "region":
+ spec = run_modify_region(spec, args.modality, args.i, **region_kwd)
+ elif args.selector == "read":
+ spec = run_modify_read(spec, args.modality, args.i, **read_kwd)
+
+ # Update spec
spec.update_spec()
- if o:
- spec.to_YAML(o)
+
+ if args.output:
+ args.output.write_text(spec.to_YAML())
else:
print(spec.to_YAML())
def run_modify_read(
- spec,
- modality,
- target_read,
- read_id,
- read_name,
- primer_id,
- min_len,
- max_len,
- strand,
- files,
-):
+ spec: Assay,
+ modality: str,
+ target_read: str,
+ read_id: Optional[str] = None,
+ read_name: Optional[str] = None,
+ primer_id: Optional[str] = None,
+ min_len: Optional[int] = None,
+ max_len: Optional[int] = None,
+ strand: Optional[str] = None,
+ files: Optional[str] = None,
+) -> Assay:
+ """Modify read properties in spec."""
reads = spec.get_seqspec(modality)
if files:
- files = parse_files_string(files)
+ files_list = parse_files_string(files)
for r in reads:
if r.read_id == target_read:
r.update_read_by_id(
- read_id, read_name, modality, primer_id, min_len, max_len, strand, files
+ read_id,
+ read_name,
+ modality,
+ primer_id,
+ min_len,
+ max_len,
+ strand,
+ files_list,
)
-
return spec
def run_modify_region(
- spec,
- modality,
- target_region,
- region_id,
- region_type,
- name,
- sequence_type,
- sequence,
- min_len,
- max_len,
-):
+ spec: Assay,
+ modality: str,
+ target_region: str,
+ region_id: Optional[str] = None,
+ region_type: Optional[str] = None,
+ name: Optional[str] = None,
+ sequence_type: Optional[str] = None,
+ sequence: Optional[str] = None,
+ min_len: Optional[int] = None,
+ max_len: Optional[int] = None,
+) -> Assay:
+ """Modify region properties in spec."""
spec.get_libspec(modality).update_region_by_id(
target_region,
region_id,
@@ -281,27 +284,23 @@ def run_modify_region(
min_len,
max_len,
)
-
return spec
-# filename,filetype,filesize,url,urltype,md5:...
-def parse_files_string(input_string):
+def parse_files_string(input_string: str) -> List[File]:
+ """Parse files string into list of File objects. # filename,filetype,filesize,url,urltype,md5:..."""
files = []
- objects = input_string.split(":")
- for obj in objects:
- parts = obj.split(",")
- filename, filetype, filesize, url, urltype, md5 = parts
-
- file = File(
- file_id=filename,
- filename=filename,
- filetype=filetype,
- filesize=int(filesize),
- url=url,
- urltype=urltype,
- md5=md5,
+ for f in input_string.split(":"):
+ filename, filetype, filesize, url, urltype, md5 = f.split(",")
+ files.append(
+ File(
+ file_id=filename,
+ filename=filename,
+ filetype=filetype,
+ filesize=int(filesize),
+ url=url,
+ urltype=urltype,
+ md5=md5,
+ )
)
- files.append(file)
-
return files
diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py
index a29c63ad..51198b96 100644
--- a/seqspec/seqspec_onlist.py
+++ b/seqspec/seqspec_onlist.py
@@ -1,16 +1,28 @@
-from seqspec.Assay import Assay
-from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist
-from seqspec.utils import load_spec, map_read_id_to_regions
-from seqspec.seqspec_find import find_by_region_type, find_by_region_id
+"""Onlist module for seqspec CLI.
+
+This module provides functionality to generate and manage onlist files for seqspec regions.
+"""
+
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS
+import warnings
import os
-from seqspec.utils import read_local_list, read_remote_list
import itertools
from typing import List
-from argparse import SUPPRESS, RawTextHelpFormatter
-import warnings
+from seqspec.Assay import Assay
+from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist
+from seqspec.utils import (
+ load_spec,
+ map_read_id_to_regions,
+ read_local_list,
+ read_remote_list,
+)
+from seqspec.seqspec_find import find_by_region_type, find_by_region_id
-def setup_onlist_args(parser):
+
+def setup_onlist_args(parser) -> ArgumentParser:
+ """Create and configure the onlist command subparser."""
subparser = parser.add_parser(
"onlist",
description="""
@@ -25,20 +37,22 @@ def setup_onlist_args(parser):
formatter_class=RawTextHelpFormatter,
)
subparser_required = subparser.add_argument_group("required arguments")
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+ subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path)
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ help="Path to output file",
+ type=Path,
default=None,
)
choices = ["read", "region", "region-type"]
subparser.add_argument(
"-s",
+ "--selector",
metavar="SELECTOR",
- help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"),
+ help=f"Selector for ID, [{', '.join(choices)}] (default: read)",
type=str,
default="read",
choices=choices,
@@ -55,6 +69,7 @@ def setup_onlist_args(parser):
format_choices = ["product", "multi"]
subparser.add_argument(
"-f",
+ "--format",
metavar="FORMAT",
type=str,
default="product",
@@ -63,16 +78,18 @@ def setup_onlist_args(parser):
)
subparser_required.add_argument(
"-i",
- metavar="IDs",
- help=("IDs"),
+ "--id",
+ metavar="ID",
+ help="ID to search for",
type=str,
default=None,
required=False,
)
subparser_required.add_argument(
"-m",
+ "--modality",
metavar="MODALITY",
- help=("Modality"),
+ help="Modality",
type=str,
default=None,
required=True,
@@ -81,7 +98,14 @@ def setup_onlist_args(parser):
return subparser
-def validate_onlist_args(parser, args):
+def validate_onlist_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the onlist command arguments."""
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
+
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
if args.r is not None:
warnings.warn(
"The '-r' argument is deprecated and will be removed in a future version. "
@@ -89,32 +113,26 @@ def validate_onlist_args(parser, args):
DeprecationWarning,
)
# Optionally map the old option to the new one
- if not args.i:
- args.i = args.r
-
- fn = args.yaml
- m = args.m
- ids = args.i
- fmt = args.f
- o = args.o
- idtype = args.s
+ if not args.id:
+ args.id = args.r
- return run_onlist(fn, m, ids, idtype, fmt, o)
+def run_onlist(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the onlist command."""
+ validate_onlist_args(parser, args)
-def run_onlist(spec_fn, modality, ids, idtype, fmt, o):
# the base path is the path to the spec file
- base_path = os.path.dirname(os.path.abspath(spec_fn))
+ base_path = args.yaml.parent.absolute()
# set the save path if it exists
- if o:
- save_path = os.path.abspath(o)
+ if args.output:
+ save_path = args.output
else:
# otherwise the save path is the same path as the spec
- save_path = os.path.join(base_path, "onlist_joined.txt")
+ save_path = base_path / "onlist_joined.txt"
# load spec
- spec = load_spec(spec_fn)
+ spec = load_spec(args.yaml)
# if number of barcodes > 1 then we need to join them
CMD = {
@@ -123,20 +141,22 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o):
"read": run_onlist_read,
}
- onlists = CMD[idtype](spec, modality, ids)
+ onlists = CMD[args.selector](spec, args.modality, args.id)
if len(onlists) == 0:
- raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}")
+ raise ValueError(
+ f"No onlist found for {args.modality}, {args.selector}, {args.id}"
+ )
# for only one onlist we can just return the path
# if only one, its remote and we save it to the base path
elif len(onlists) == 1:
urltype = onlists[0].urltype
- onlist_fn = os.path.basename(onlists[0].filename)
- onlist_path = os.path.join(base_path, onlist_fn)
- if os.path.exists(onlist_path):
+ onlist_fn = Path(onlists[0].filename).name
+ onlist_path = base_path / onlist_fn
+ if onlist_path.exists():
urltype = "local"
- elif urltype == "http":
+ elif urltype in ["http", "https"]:
# download the onlist to the base path and return the path
onlist_elements = read_remote_list(onlists[0])
onlist_path = write_onlist(onlist_elements, save_path)
@@ -147,15 +167,14 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o):
for o in onlists:
if o.urltype == "local":
lsts.append(read_local_list(o, base_path))
- elif o.urltype == "http":
+ elif o.urltype in ["http", "https"]:
# base_path is ignored for remote onlists
lsts.append(read_remote_list(o, base_path))
- onlist_elements = join_onlists(lsts, fmt)
+ onlist_elements = join_onlists(lsts, args.format)
onlist_path = write_onlist(onlist_elements, save_path)
# print the path to the onlist
print(onlist_path)
- return
def run_onlist_region_type(
@@ -174,7 +193,9 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist
regions = find_by_region_id(spec, modality, region_id)
onlists: List[Onlist] = []
for r in regions:
- onlists.append(r.get_onlist())
+ ol = r.get_onlist()
+ if ol:
+ onlists.append(ol)
if len(onlists) == 0:
raise ValueError(f"No onlist found for region {region_id}")
return onlists
diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py
index be7f304a..9ad66871 100644
--- a/seqspec/seqspec_print.py
+++ b/seqspec/seqspec_print.py
@@ -1,13 +1,33 @@
-from seqspec.utils import load_spec
-from seqspec.seqspec_print_html import print_seqspec_html
+"""Print module for seqspec CLI.
+
+This module provides functionality to print sequence and/or library structure
+in various formats (ascii, png, html).
+"""
+
+from typing import List, Any
+from pathlib import Path
+from argparse import ArgumentParser, RawTextHelpFormatter, Namespace
import newick
-from seqspec.utils import REGION_TYPE_COLORS
+import matplotlib.pyplot as plt
+from matplotlib.patches import Rectangle
+import matplotlib.patches as mpatches
+
+from seqspec.utils import load_spec, REGION_TYPE_COLORS
+from seqspec.seqspec_print_html import print_seqspec_html
from seqspec.Region import complement_sequence
-from seqspec.Region import project_regions_to_coordinates
-from argparse import RawTextHelpFormatter
+from seqspec.Assay import Assay
+from seqspec.seqspec_print_utils import libseq
-def setup_print_args(parser):
+def setup_print_args(parser) -> ArgumentParser:
+ """Create and configure the print command subparser.
+
+ Args:
+ parser: The main argument parser to add the print subparser to.
+
+ Returns:
+ The configured print subparser.
+ """
subparser = parser.add_parser(
"print",
description="""
@@ -23,20 +43,23 @@ def setup_print_args(parser):
help="Display the sequence and/or library structure from seqspec file",
formatter_class=RawTextHelpFormatter,
)
- subparser.add_argument("yaml", help="Sequencing specification yaml file")
+
+ subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file")
subparser.add_argument(
"-o",
+ "--output",
metavar="OUT",
- help=("Path to output file"),
- type=str,
+ type=Path,
+ help="Path to output file",
default=None,
)
format_choices = ["library-ascii", "seqspec-html", "seqspec-png", "seqspec-ascii"]
subparser.add_argument(
"-f",
+ "--format",
metavar="FORMAT",
- help=(f"Format ({', '.join(format_choices)}), default: library-ascii"),
+ help=f"Format ({', '.join(format_choices)}), default: library-ascii",
type=str,
default="library-ascii",
choices=format_choices,
@@ -45,81 +68,107 @@ def setup_print_args(parser):
return subparser
-def validate_print_args(parser, args):
- fmt = args.f
+def validate_print_args(parser: ArgumentParser, args: Namespace) -> None:
+ """Validate the print command arguments.
- fn = args.yaml
- o = args.o
- if fmt == "seqspec-png" and o is None:
+ Args:
+ parser: The argument parser.
+ args: The parsed arguments.
+
+ Raises:
+ parser.error: If any validation fails.
+ """
+ if not Path(args.yaml).exists():
+ parser.error(f"Input file does not exist: {args.yaml}")
+
+ if args.output and Path(args.output).exists() and not Path(args.output).is_file():
+ parser.error(f"Output path exists but is not a file: {args.output}")
+
+ if args.format == "seqspec-png" and args.output is None:
parser.error("Output file required for png format")
- return run_seqspec_print(fn, fmt, o)
+def run_print(parser: ArgumentParser, args: Namespace) -> None:
+ """Run the print command.
-def run_seqspec_print(spec_fn, fmt, o):
- spec = load_spec(spec_fn)
+ Args:
+ parser: The argument parser.
+ args: The parsed arguments.
+ """
+
+ validate_print_args(parser, args)
+
+ spec = load_spec(args.yaml)
+ result = seqspec_print(spec, args.format)
+
+ if args.format == "seqspec-png":
+ result.savefig(args.output, dpi=300, bbox_inches="tight")
+ elif args.output:
+ with open(args.output, "w") as f:
+ print(result, file=f)
+ else:
+ print(result)
- # TODO: add reads to seqspec html
- # TODO: add reads to seqspec png
- CMD = {
+
+def seqspec_print(spec: Assay, fmt: str):
+ """Print sequence specification in the specified format.
+
+ Args:
+ spec: The seqspec specification to print
+ fmt: The format to print in (library-ascii, seqspec-html, seqspec-png, seqspec-ascii)
+
+ Returns:
+ The formatted output (string or matplotlib figure)
+
+ Raises:
+ ValueError: If format is not supported
+ """
+ # Map format to print function
+ format_to_function = {
"library-ascii": print_library_ascii,
"seqspec-html": print_seqspec_html,
"seqspec-png": print_seqspec_png,
"seqspec-ascii": print_seqspec_ascii,
}
- s = CMD[fmt](spec)
+ if fmt not in format_to_function:
+ raise ValueError(
+ f"Unsupported format: {fmt}. Must be one of {list(format_to_function.keys())}"
+ )
- if fmt == "png":
- return s.savefig(o, dpi=300, bbox_inches="tight")
+ return format_to_function[fmt](spec)
- if o:
- with open(o, "w") as f:
- print(s, file=f)
- else:
- print(s)
- return
+def print_seqspec_ascii(spec: Assay) -> str:
+ """Print sequence specification in ASCII format.
-def print_seqspec_ascii(spec):
- p = []
+ Args:
+ spec: The seqspec specification to print.
+
+ Returns:
+ The ASCII formatted string.
+ """
+ parts = []
for modality in spec.modalities:
- p.append(libseq(spec, modality))
- return "\n".join(p)
+ parts.append(format_libseq(spec, modality, *libseq(spec, modality)))
+ return "\n".join(parts)
-def libseq(spec, modality):
+def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str:
+ """Format library sequence for a specific modality.
+
+ Args:
+ spec: The seqspec specification.
+ modality: The modality to format.
+ p: Positive strand parts.
+ n: Negative strand parts.
+
+ Returns:
+ The formatted string.
+ """
libspec = spec.get_libspec(modality)
- seqspec = spec.get_seqspec(modality)
-
- p = []
- n = []
- leaves = libspec.get_leaves()
- cuts = project_regions_to_coordinates(leaves)
- for idx, read in enumerate(seqspec, 1):
- read_len = read.max_len
- read_id = read.read_id
- primer_id = read.primer_id
- primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0]
- primer_pos = cuts[primer_idx]
- if read.strand == "pos":
- wsl = primer_pos.stop - 1
- ws = wsl * " "
-
- arrowl = read_len - 1
- arrow = arrowl * "-"
-
- p.append(f"{ws}|{arrow}>({idx}) {read_id}")
- elif read.strand == "neg":
- wsl = primer_pos.start - read_len
- ws = wsl * " "
-
- arrowl = read_len - 1
- arrow = arrowl * "-"
-
- n.append(f"{ws}<{arrow}|({idx}) {read_id}")
-
- s = "\n".join(
+
+ return "\n".join(
[
modality,
"---",
@@ -129,120 +178,123 @@ def libseq(spec, modality):
"\n".join(n),
]
)
- return s
-def run_print(data):
- header = headerTemplate(data.name, data.doi, data.description, data.modalities)
- header2 = "## Final Library"
- library_spec = multiModalTemplate(data.library_spec)
- s = f"{header}\n{header2}\n{library_spec}"
- return s
+def print_library_ascii(spec: Assay) -> str:
+ """Print library structure in ASCII format.
+ Args:
+ spec: The seqspec specification to print.
-def run_print_sequence_spec(spec):
- p = []
- for r in spec.sequence_spec:
- p.append(
- "\t".join(
- [r.read_id, r.primer_id, r.strand, str(r.min_len), str(r.max_len)]
- )
- )
- return "\n".join(p)
-
-
-def print_library_ascii(spec):
- t = []
+ Returns:
+ The ASCII formatted string.
+ """
+ trees = []
for r in spec.library_spec:
- t.append(r.to_newick())
- n = ",".join(t)
- # print(n)
- tree = newick.loads(f"({n})")
+ trees.append(r.to_newick())
+ tree_str = ",".join(trees)
+ tree = newick.loads(f"({tree_str})")
return tree[0].ascii_art()
-def argsort(arr):
- # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3382369#3382369
- # by unutbu
- return sorted(range(len(arr)), key=arr.__getitem__)
-
+def print_seqspec_png(spec: Assay):
+ """Print sequence specification as PNG.
-def print_seqspec_png(spec):
- # builds directly off of https://colab.research.google.com/drive/1ZCIGrwLEIfE0yo33bP8uscUNPEn1p1DH developed by https://github.com/LucasSilvaFerreira
+ Args:
+ spec: The seqspec specification to print.
- # modality
+ Returns:
+ The matplotlib figure.
+ """
modalities = spec.list_modalities()
modes = [spec.get_libspec(m) for m in modalities]
lengths = [i.min_len for i in modes]
nmodes = len(modalities)
- # sort the modalities by their lengths
+ # Sort modalities by length
asort = argsort(lengths)
modalities = [modalities[i] for i in asort]
lengths = [lengths[i] for i in asort]
modes = [modes[i] for i in asort]
- assay_id = spec.assay_id
- fig, _ = plot_png(assay_id, modalities, modes, nmodes, lengths)
- return fig
+ return plot_png(spec.assay_id, modalities, modes, nmodes, lengths)
+
+
+def argsort(arr: List[Any]) -> List[int]:
+ """Get indices that would sort an array.
+
+ Args:
+ arr: The array to sort.
+
+ Returns:
+ List of indices that would sort the array.
+ """
+ return sorted(range(len(arr)), key=arr.__getitem__)
+
+def plot_png(
+ assay: str, modalities: List[str], modes: List[Any], nmodes: int, lengths: List[int]
+):
+ """Create PNG plot of sequence specification.
-def plot_png(assay, modalities, modes, nmodes, lengths):
- import matplotlib.pyplot as plt
- from matplotlib.patches import Rectangle
- import matplotlib.patches as mpatches
+ Args:
+ assay: The assay ID.
+ modalities: List of modalities.
+ modes: List of mode specifications.
+ nmodes: Number of modes.
+ lengths: List of lengths.
+ Returns:
+ The matplotlib figure.
+ """
fsize = 15
plt.rcParams.update({"font.size": fsize})
- fig, ax = plt.subplots(
- figsize=(10, 1 * nmodes), nrows=nmodes, constrained_layout=True
- )
- fig.suptitle(assay)
+ fig, _ = plt.subplots(figsize=(10, 1 * nmodes), nrows=nmodes)
+ title_offset = 0.98 if nmodes > 1 else 1.2
+ fig.suptitle(assay, y=title_offset)
+
rts = []
for m, ax in zip(modes, fig.get_axes()):
- # get leaves
+ # Get leaves
leaves = m.get_leaves()
- # setup plotting variables
+ # Setup plotting variables
y = 0
x = 0
height = 1
for idx, node in enumerate(leaves):
- # region tupe
+ # Region type
rtype = node.region_type.lower()
- # add to the global list so we can make a legend
rts.append(rtype)
- # get region properties
+
+ # Get region properties
length = node.min_len
label = f"{length}"
- # setup rectangle for the region
+ # Setup rectangle for the region
rectangle = Rectangle(
(x, y), length, height, color=REGION_TYPE_COLORS[rtype], ec="black"
)
- # write in the length of the region in the rectangle
+ # Write length in the rectangle
ax.text(
x + length / 2,
y + height / 2,
label,
horizontalalignment="center",
verticalalignment="center",
- ) # , rotation=90)
- # add the rectangle
+ )
ax.add_patch(rectangle)
- # add length to x for next region
+ # Add length to x for next region
x += length
ax.autoscale()
+ ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)})
- # since all axes use the same scale, set the xlim to be 0 to the max length
- ax.set(**{"xlim": (0, max(lengths))})
-
- # hide the spines
+ # Hide the spines
for spine in ["right", "top", "left", "bottom"]:
ax.spines[spine].set_visible(False)
# Hide the axis and ticks and labels
@@ -250,10 +302,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths):
ax.set_yticklabels([])
ax.set_yticks([])
- # label the modality on the ylabel
+ # Label the modality on the ylabel
ax.set_ylabel(m.region_type, rotation=0, fontsize=20, ha="right", va="center")
- # adjust the xaxis for the last modality to show the length
+ # Adjust the xaxis for the last modality to show the length
ax.xaxis.set_visible(True)
ax.spines["bottom"].set_visible(True)
ax.minorticks_on()
@@ -264,68 +316,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths):
}
)
- # setup the figure legend
+ # Setup the figure legend
handles = []
for t in sorted(set(rts)):
handles.append(mpatches.Patch(color=REGION_TYPE_COLORS[t], label=t))
fig.legend(handles=handles, loc="center", bbox_to_anchor=(1.1, 0.55))
- return (fig, ax)
-
-def headerTemplate(name, doi, description, modalities):
- s = f"""# {name}
-- DOI: [{doi}]({doi})
-- Description: {description}
-- Modalities: {", ".join(modalities)}
- """
- return s
-
-
-def atomicRegionTemplate(
- name, region_type, sequence_type, sequence, min_len, max_len, onlist, ns=0
-):
- s = f"""{name}
-
-{' '*ns}- region_type: {region_type}
-{' '*ns}- sequence_type: {sequence_type}
-{' '*ns}- sequence: {sequence}
-{' '*ns}- min_len: {min_len}
-{' '*ns}- max_len: {max_len}
-{' '*ns}- onlist: {onlist}
-{' '*ns}
{region.sequence}"""
- return s
-
-
-def multiModalTemplate(library_spec):
- s = "\n".join(
- [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec]
- )
- return s
+ return fig
diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py
index f469238a..8ec4694e 100644
--- a/seqspec/seqspec_print_html.py
+++ b/seqspec/seqspec_print_html.py
@@ -1,16 +1,23 @@
-from seqspec.Region import Region
+"""Print HTML module for seqspec.
+This module provides functionality to generate HTML representations of seqspec files.
+It is used by the print command with the 'seqspec-html' format option.
+"""
+
+from typing import List, Optional
+
+from seqspec.Assay import Assay
+from seqspec.Region import Region, Onlist, complement_sequence
+from seqspec.Read import Read, File
+from seqspec.seqspec_print_utils import libseq
-def print_seqspec_html(spec):
- # header = headerTemplate(spec.name, spec.doi, spec.description, spec.modalities)
- # header2 = "## Final Library"
- # library_spec = multiModalTemplate(spec.library_spec)
- # s = f"{header}\n{header2}\n{library_spec}"
- s = htmlTemplate(spec)
- return s
+def print_seqspec_html(spec: Assay) -> str:
+ """Generate HTML representation of seqspec."""
+ return htmlTemplate(spec)
-def headerTemplate(name, doi, description, modalities):
+
+def headerTemplate(name: str, doi: str, description: str, modalities: List[str]) -> str:
s = f"""
-{colorSeq(region.get_leaves())}
+{seq}
"""
return s
-def multiModalTemplate(library_spec):
- s = "".join(
- [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec]
- )
+def atomicReadTemplate(read: Read) -> str:
+ files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else ""
+
+ s = f"""
+