diff --git a/README.md b/README.md index 2c7c1150..9ddbafbe 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # seqspec -![github version](https://img.shields.io/badge/Version-0.3.1-informational) -[![pypi version](https://img.shields.io/pypi/v/seqspec)](https://pypi.org/project/seqspec/0.3.1/) +![github version](https://img.shields.io/badge/Version-0.3.0-informational) ![python versions](https://img.shields.io/pypi/pyversions/seqspec) [![license](https://img.shields.io/pypi/l/seqspec)](LICENSE) diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index 9e7db60a..46e683a2 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -7,16 +7,8 @@ authors: # Installation -The development version can be installed with - -```bash -pip install git+https://github.com/pachterlab/seqspec@devel -``` - -The official release can be installed directly from pypi - ```bash -pip install seqspec +pip install git+https://github.com/IGVF-DACC/seqspec.git@main ``` Verify the installation diff --git a/docs/SEQSPEC_TOOL.md b/docs/SEQSPEC_TOOL.md index 1e355e71..27037676 100644 --- a/docs/SEQSPEC_TOOL.md +++ b/docs/SEQSPEC_TOOL.md @@ -61,6 +61,12 @@ Check that the `seqspec` file is correctly formatted and consistent with the [sp seqspec check [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_check import run_check + +run_check(schema_fn: str, spec_fn: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - `yaml` corresponds to the `seqspec` file. @@ -133,6 +139,12 @@ $ seqspec check spec.yaml seqspec find [-h] [-o OUT] [-s Selector] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_find import run_find + +run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `-s Selector` is the type of the ID you are searching for (default: region). Can be one of - read @@ -195,6 +207,12 @@ $ seqspec find -m rna -s region-type -i barcode spec.yaml seqspec file [-h] [-o OUT] [-i IDs] -m MODALITY [-s SELECTOR] [-f FORMAT] [-k KEY] yaml ``` +```python +from seqspec.seqspec_file import run_file + +run_file(spec_fn: str, m: str, ids: List[str], idtype: str, fmt: str, k: str, o: str, fp=False) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `-s Selector` is the type of the ID you are searching for (default: read). Can be one of - read @@ -266,6 +284,11 @@ Automatically fill in missing fields in the spec. seqspec format [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_format import run_format +run_format(spec_fn: str, o: str) +``` + - `-o OUT` the path to create the formatted `seqspec` file. - `yaml` corresponds to the `seqspec` file. @@ -283,11 +306,16 @@ $ seqspec format -o spec.yaml spec.yaml Identify the position of elements in a spec for use in downstream tools. Returns the 0-indexed position of elements contained in a given region in the 5'->3' direction. -``` +```bash seqspec index [-o OUT] [-t TOOL] [--rev] -m MODALITY -r REGION yaml seqspec index [-h] [-o OUT] [-t TOOL] [-s SELECTOR] [--rev] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_index import run_index +run_index(spec_fn: str, modality: str, ids: List[str], idtype: str, fmt: str, rev: str, subregion_type: str, o) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `--rev` can be set to return the 3'->5' index. - optionally, `-t TOOL` returns the indices in the format specified by the tool. One of: @@ -342,6 +370,11 @@ $ seqspec index -m atac -t kb -s file spec.yaml seqspec info [-h] [-k KEY] [-f FORMAT] [-o OUT] yaml ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k=None, o=None) +``` + - optionally, `-o OUT` path to write the info. - optionally, `-k KEY` the object to display (default: meta). Can be one of - modalities @@ -413,6 +446,11 @@ $ seqspec info -f json -k sequence_spec spec.yaml seqspec init [-h] -n NAME -m MODALITIES -r READS [-o OUT] newick ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k: str = None, o: str = None) +``` + - optionally, `-o OUT` path to create `seqspec` file. - `-m MODALITIES` is a comma-separated list of modalities (e.g. rna,atac) - `-n NAME` is the name associated with the `seqspec` file. @@ -432,7 +470,7 @@ $ seqspec init -n myassay -m rna -o spec.yaml -r rna,R1.fastq.gz,r1_primer,26,po $ seqspec init -n myassay -m rna,atac -o spec.yaml -r rna,rna_R1.fastq.gz,rna_r1_primer,26,pos:rna,rna_R2.fastq.gz,rna_r2_primer,100,neg:atac,atac_R1.fastq.gz,atac_r1_primer,100,pos:atac,atac_R2.fastq.gz,atac_r1_primer,16,neg:atac,atac_R3.fastq.gz,atac_r2_primer,100,neg "(((rna_r1_primer:0,barcode:16,umi:12,cdna:150,rna_r2_primer:0)rna),(barcode:16,atac_r1_primer:1,gdna:150,atac_r2_primer)atac)" ``` -## `seqsoec methods`: Convert seqspec file into methods section +## `seqspec methods`: Convert seqspec file into methods section Generate a methods section from a seqspec file. @@ -440,6 +478,11 @@ Generate a methods section from a seqspec file. seqspec methods [-h] -m MODALITY [-o OUT] yaml ``` +```python +from seqspec.seqspec_methods import run_methods +run_methods(spec_fn: str, m: str, o: str) +``` + - optionally, `-o OUT` path to write the methods section. - `-m MODALITY` is the modality to write the methods for. - `yaml` corresponds to the `seqspec` file. @@ -479,6 +522,13 @@ The library was sequenced on a Illumina NovaSeq 6000 (EFO:0008637) using the Nov seqspec modify [-h] [--read-id READID] [--read-name READNAME] [--primer-id PRIMERID] [--strand STRAND] [--files FILES] [--region-id REGIONID] [--region-type REGIONTYPE] [--region-name REGIONNAME] [--sequence-type SEQUENCETYPE] [--sequence SEQUENCE] [--min-len MINLEN] [--max-len MAXLEN] [-o OUT] [-i IDs] [-s SELECTOR] -m MODALITY yaml ``` +```python +from seqspec.seqspec_modify import run_modify_read, run_modify_region + +run_modify_read(spec, modality, target_read, read_id, read_name, primer_id, min_len, max_len, strand, files) +run_modify_region(spec, modality, target_region, region_id, region_type, name, sequence_type, sequence, min_len, max_len) +``` + Read modifications - optionally, `--read-id READID` specifies the new `read_id`. @@ -529,6 +579,12 @@ $ seqspec modify -m atac -o mod_spec.yaml -i atac_R1 --files "R1_1.fastq.gz,fast seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f FORMAT] [-i IDs] -m MODALITY yaml ``` +```python +from seqspec.seqspec_onlist import run_onlist + +run_onlist(spec_fn, modality, ids, idtype, fmt, o) +``` + - optionally, `-o OUT` to set the path of the onlist file. - `-m MODALITY` is the modality in which you are searching for the region. - `-i ID` is the `id` of the object to search for the onlist. @@ -563,6 +619,11 @@ Print sequence and/or library structure as ascii, png, or html. seqspec print [-h] [-o OUT] [-f FORMAT] yaml ``` +```python +from seqspec.seqspec_print import run_seqspec_print +run_seqspec_print(spec_fn, fmt, o) +``` + - optionally, `-o OUT` to set the path of printed file. - optionally, `-f FORMAT` is the format of the printed file. Can be one of: - `library-ascii`: prints an ascii tree of the library_spec @@ -651,6 +712,11 @@ $ seqspec print -o spec.png -f seqspec-png spec.yaml seqspec split [-h] -o OUT yaml ``` +```python +from seqspec.seqspec_split import run_split +run_split(spec_fn, o) +``` + - optionally, `-o OUT` name prepended to split specs. - `yaml` corresponds to the `seqspec` file. @@ -673,6 +739,11 @@ split.tag.yaml seqspec version [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_version import run_version +run_version(spec_fn, o) +``` + - optionally, `-o OUT` path to file to write output. - `yaml` corresponds to the `seqspec` file. @@ -693,6 +764,11 @@ This is a hidden subcommand that upgrades an old version of the spec to the curr seqspec upgrade [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_upgrade import run_upgrade +run_upgrade(spec_fn, o) +``` + ### Examples ```bash diff --git a/docs/SPECIFICATION.md b/docs/SPECIFICATION.md index f4ad0be3..63b83843 100644 --- a/docs/SPECIFICATION.md +++ b/docs/SPECIFICATION.md @@ -153,6 +153,7 @@ Each `Region` has the following properties which are useful to annotate the elem - `rna`: The modality corresponding to assaying RNA. - `s5`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME1. - `s7`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME2. + - `sgrna_target`: A sequence corresponding to the guide RNA spacer region that determines the genomic target of CRISPR-based perturbations. - `tag`: A short sequence of DNA or RNA used to label or identify a sample, protein, or other grouping. - `truseq_read1`: The first read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. - `truseq_read2`: The second read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. diff --git a/docs/assays/10xcrispr.spec.yaml b/docs/assays/10xcrispr.spec.yaml index 87e4d369..a95cc5ae 100644 --- a/docs/assays/10xcrispr.spec.yaml +++ b/docs/assays/10xcrispr.spec.yaml @@ -203,7 +203,7 @@ library_spec: parent_id: crispr - !Region region_id: sgrna_target - region_type: crispr + region_type: sgrna_target name: sgrna_target sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNXXX diff --git a/docs/assays/sccrispra.spec.yaml b/docs/assays/sccrispra.spec.yaml index e0f27906..c825dc37 100644 --- a/docs/assays/sccrispra.spec.yaml +++ b/docs/assays/sccrispra.spec.yaml @@ -223,7 +223,7 @@ library_spec: - !Region parent_id: crispr_R2_001.fastq.gz region_id: gRNA - region_type: gRNA + region_type: sgrna_target name: Guide RNAs sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNNNN diff --git a/requirements.txt b/requirements.txt index 4d234556..acff0508 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ jsonschema newick requests biopython -packaging \ No newline at end of file +packaging +matplotlib>=3.4.0 \ No newline at end of file diff --git a/seqspec/File.py b/seqspec/File.py index 70a52f1e..423dcedf 100644 --- a/seqspec/File.py +++ b/seqspec/File.py @@ -24,26 +24,18 @@ def __init__( self.md5 = md5 def __repr__(self) -> str: - d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": self.md5, - } + d = self.to_dict() return f"{d}" def to_dict(self): d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": self.md5, + "file_id": getattr(self, "file_id", None), + "filename": getattr(self, "filename", None), + "filetype": getattr(self, "filetype", None), + "filesize": getattr(self, "filesize", None), + "url": getattr(self, "url", None), + "urltype": getattr(self, "urltype", None), + "md5": getattr(self, "md5", None), } return d diff --git a/seqspec/Read.py b/seqspec/Read.py index 6681db38..6984998d 100644 --- a/seqspec/Read.py +++ b/seqspec/Read.py @@ -32,24 +32,13 @@ def set_files(self, files: Optional[List["File"]] = []): self.files = files def __repr__(self) -> str: - d = { - "read_id": self.read_id, - "name": self.name, - "modality": self.modality, - "primer_id": self.primer_id, - "min_len": self.min_len, - "max_len": self.max_len, - "strand": self.strand, - "files": self.files, - } + d = self.to_dict() return f"{d}" def to_dict(self): # TODO is this necessary for backwards compatibility? - if self.files: - files = [i.to_dict() for i in self.files] - else: - files = [] + files = getattr(self, "files", []) + files = [i.to_dict() for i in files] d = { "read_id": self.read_id, "name": self.name, diff --git a/seqspec/Region.py b/seqspec/Region.py index 9a2a7fdf..f7edbc01 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -417,7 +417,7 @@ def __init__( url: str, urltype: str, md5: str, - location: Optional[str], + # location: Optional[str], ) -> None: super().__init__() self.file_id = file_id diff --git a/seqspec/__init__.py b/seqspec/__init__.py index 260c070a..ad6cb61b 100644 --- a/seqspec/__init__.py +++ b/seqspec/__init__.py @@ -1 +1,8 @@ -__version__ = "0.3.1" +__version__ = "0.3.0" + + +def get_version(): + """ + Returns the version of the package. + """ + return __version__ diff --git a/seqspec/main.py b/seqspec/main.py index d64a881b..b800ab78 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -1,46 +1,48 @@ -from . import __version__ -import argparse -import sys -from .seqspec_format import setup_format_args, validate_format_args -from .seqspec_print import setup_print_args, validate_print_args -from .seqspec_check import setup_check_args, validate_check_args -from .seqspec_find import setup_find_args, validate_find_args - -# from .seqspec_genbank import setup_genbank_args, validate_genbank_args -from .seqspec_modify import setup_modify_args, validate_modify_args -from .seqspec_index import setup_index_args, validate_index_args -from .seqspec_info import setup_info_args, validate_info_args - -from .seqspec_split import setup_split_args, validate_split_args -from .seqspec_init import setup_init_args, validate_init_args -from .seqspec_onlist import setup_onlist_args, validate_onlist_args -from .seqspec_version import setup_version_args, validate_version_args -from .seqspec_methods import setup_methods_args, validate_methods_args -from .seqspec_file import setup_file_args, validate_file_args -from .seqspec_upgrade import setup_upgrade_args, validate_upgrade_args +"""Main module for seqspec CLI. -import warnings - -# Steps to add new subcommands -# Create seqspec_subcommand.py (create setup_subcmd_args, validate_subcmd_args, run_subcmd in that file) -# (in this file) from seqspec_subcmd import setup_subcmd_args, validate_subcmd_args -# Add setup_subcmd_args to command_to_parser along with its key==str(subcmd) -# Add validate_subcmd_args to COMMAND_TO_FUNCTION along with its key==str(subcmd) +This module provides the main entry point for the seqspec command-line interface. +It handles argument parsing, command routing, and execution of subcommands. +""" +import sys +import warnings +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, Callable, Any -def main(): - warnings.simplefilter("default", DeprecationWarning) +from . import __version__ - # setup parsers - parser = argparse.ArgumentParser( +# Import subcommand modules +from .seqspec_format import setup_format_args, run_format +from .seqspec_print import setup_print_args, run_print +from .seqspec_check import setup_check_args, run_check +from .seqspec_find import setup_find_args, run_find +from .seqspec_convert import setup_convert_args, run_convert +from .seqspec_modify import setup_modify_args, run_modify +from .seqspec_index import setup_index_args, run_index +from .seqspec_info import setup_info_args, run_info +from .seqspec_split import setup_split_args, run_split +from .seqspec_init import setup_init_args, run_init +from .seqspec_onlist import setup_onlist_args, run_onlist +from .seqspec_version import setup_version_args, run_version +from .seqspec_methods import setup_methods_args, run_methods +from .seqspec_file import setup_file_args, run_file +from .seqspec_upgrade import setup_upgrade_args, run_upgrade + + +def setup_parser(): + """Create and configure the main argument parser. + + Returns: + Configured ArgumentParser instance. + """ + parser = ArgumentParser( description=f""" seqspec {__version__}: A machine-readable file format for genomic library sequence and structure. GitHub: https://github.com/pachterlab/seqspec Documentation: https://pachterlab.github.io/seqspec/ - """, - formatter_class=argparse.RawTextHelpFormatter, + formatter_class=RawTextHelpFormatter, ) subparsers = parser.add_subparsers( @@ -54,7 +56,7 @@ def main(): "find": setup_find_args(subparsers), "file": setup_file_args(subparsers), "format": setup_format_args(subparsers), - # "genbank": setup_genbank_args(subparsers), + "convert": setup_convert_args(subparsers), "index": setup_index_args(subparsers), "info": setup_info_args(subparsers), "init": setup_init_args(subparsers), @@ -67,7 +69,18 @@ def main(): "version": setup_version_args(subparsers), } - # Show help when no arguments are given + return parser, command_to_parser + + +def handle_no_args( + parser: ArgumentParser, command_to_parser: Dict[str, ArgumentParser] +) -> None: + """Handle case when no arguments are provided. + + Args: + parser: Main argument parser. + command_to_parser: Dictionary mapping commands to their parsers. + """ if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -80,27 +93,43 @@ def main(): parser.print_help(sys.stderr) sys.exit(1) + +def main() -> None: + """Main entry point for the seqspec CLI.""" + warnings.simplefilter("default", DeprecationWarning) + + parser, command_to_parser = setup_parser() + handle_no_args(parser, command_to_parser) + args = parser.parse_args() - # Setup validator and runner for all subcommands (validate and run if valid) - COMMAND_TO_FUNCTION = { - "format": validate_format_args, - "print": validate_print_args, - "check": validate_check_args, - "find": validate_find_args, - "index": validate_index_args, - "info": validate_info_args, - "init": validate_init_args, - "methods": validate_methods_args, - "modify": validate_modify_args, - "onlist": validate_onlist_args, - "split": validate_split_args, - "version": validate_version_args, - "file": validate_file_args, - "upgrade": validate_upgrade_args, - # "genbank": validate_genbank_args, + # Setup validator and runner for all subcommands + command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = { + "format": run_format, + "print": run_print, + "check": run_check, + "find": run_find, + "index": run_index, + "info": run_info, + "init": run_init, + "methods": run_methods, + "modify": run_modify, + "onlist": run_onlist, + "split": run_split, + "version": run_version, + "file": run_file, + "upgrade": run_upgrade, + "convert": run_convert, } - COMMAND_TO_FUNCTION[sys.argv[1]](parser, args) + + try: + command_to_function[sys.argv[1]](parser, args) + except KeyError: + parser.print_help(sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 997d14a0..5b3ddd06 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -71,6 +71,8 @@ "massively parallel reporter assay (OBI:0002675)", "chromosome conformation capture-on-chip assay (OBI:0002458)", "single nucleus methylation chromatin conformation capture seq (NTR:0000745)", + "in vitro CRISPR screen using flow cytometry (OBI:0003661)", + "in vitro CRISPR screen using single-cell RNA-seq (OBI:0003660)", "Custom" ] }, @@ -141,6 +143,7 @@ "Illumina NextSeq 2000 (EFO:0010963)", "Illumina NovaSeq X (NTR:0000765)", "PacBio RS II (EFO:0008631)", + "Illumina NovaSeq X (EFO:0022840)", "Custom" ] }, @@ -265,6 +268,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", @@ -292,6 +296,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json new file mode 100644 index 00000000..9d0ffd3a --- /dev/null +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -0,0 +1,398 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "bead_TSO", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "sgrna_target", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { + "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read.", + "pattern": "^IGVF.*" + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." + }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json new file mode 100644 index 00000000..9908307f --- /dev/null +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -0,0 +1,397 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "bead_TSO", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "sgrna_target", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { + "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read." + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." + }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 702dfbe9..672ebaa2 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -1,12 +1,24 @@ +"""Check module for seqspec CLI. + +This module provides functionality to validate seqspec files against the specification schema. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List, Dict, Optional + from jsonschema import Draft4Validator import yaml from os import path + from seqspec.utils import load_spec, file_exists from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter + +from seqspec import get_version def setup_check_args(parser): + """Create and configure the check command subparser.""" subparser = parser.add_parser( "check", description=""" @@ -19,182 +31,384 @@ def setup_check_args(parser): help="Validate seqspec file against specification", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), + help="Path to output file", + type=Path, + default=None, + ) + subparser.add_argument( + "-s", + "--skip", + metavar="SKIP", + help="Skip checks", type=str, default=None, + choices=["igvf", "igvf_onlist_skip"], ) + + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) + return subparser -def validate_check_args(parser, args): - spec_fn = args.yaml - o = args.o - schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") +def validate_check_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the check command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - return run_check(schema_fn, spec_fn, o) + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_check(schema_fn, spec_fn, o): - spec = load_spec(spec_fn) +def format_error(errobj, idx=0): + return f"[error {idx}] {errobj['error_message']}" - with open(schema_fn, "r") as stream: - schema = yaml.load(stream, Loader=yaml.Loader) - v = Draft4Validator(schema) - errors = check(v, spec, spec_fn) +def seqspec_check( + spec: Assay, spec_fn: str, filter_type: Optional[str] = None +) -> List[Dict]: + """Core functionality to check a seqspec and return filtered errors. - if errors: - if o: - with open(o, "w") as f: - print("\n".join(errors), file=f) - else: - print("\n".join(errors)) + Args: + spec: The Assay object to check + spec_fn: Path to the spec file, used for relative path resolution + filter_type: Optional filter type to apply to errors (e.g. "igvf", "igvf_onlist_skip") + + Returns: + List of error dictionaries + """ + errors = check(spec, spec_fn, filter_type) + if filter_type: + errors = filter_errors(errors, filter_type) return errors -def check(schema: Draft4Validator, spec: Assay, spec_fn: str): - errors = [] - idx = 0 +def run_check(parser: ArgumentParser, args: Namespace): + """Run the check command.""" + validate_check_args(parser, args) + + spec = load_spec(args.yaml) + errors = seqspec_check(spec, args.yaml, args.skip) + + if args.output: + with open(args.output, "w") as f: + for idx, e in enumerate(errors, 1): + print(format_error(e, idx), file=f) + else: + for idx, e in enumerate(errors, 1): + print(format_error(e, idx)) + return errors + + +IGVF_ONLIST_SKIP_FILTERS = [ + {"error_type": "check_onlist_files_exist", "error_object": "onlist"} +] - # with open("del.json", "w") as f: - # json.dump(spec.to_dict(), f, indent=4) - - for idx, error in enumerate(schema.iter_errors(spec.to_dict()), 1): - errors.append( - f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]" - ) - idx += 1 - # check that the modalities are unique - if len(spec.modalities) != len(set(spec.modalities)): - errors.append( - f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" - ) - idx += 1 - # check that region_ids of the first level of the spec correspond to the modalities - # one for each modality - modes = spec.modalities - rgns = spec.library_spec - for r in rgns: - rid = r.region_id - if rid not in modes: - errors.append( - f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" +def filter_errors(errors, filter_type): + filters = None + if filter_type == "igvf_onlist_skip": + filters = IGVF_ONLIST_SKIP_FILTERS + + if filters: + ferrors = [] + for error in errors: + # Check if this specific error combination exists in the filters + should_filter = any( + error["error_type"] == filter_item["error_type"] + and error["error_object"] == filter_item["error_object"] + for filter_item in filters ) - idx += 1 - # get all of the onlist files in the spec and check that they exist relative to the path of the spec - modes = spec.modalities - olrgns = [] - for m in modes: - olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] - - # check paths relative to spec_fn - for ol in olrgns: - if ol.urltype == "local": - if ol.filename[:-3] == ".gz": - check = path.join(path.dirname(spec_fn), ol.filename[:-3]) - if not path.exists(check): - errors.append(f"[error {idx}] {ol.filename[:-3]} does not exist") - idx += 1 - else: - check = path.join(path.dirname(spec_fn), ol.filename) - check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") - if not path.exists(check) and not path.exists(check_gz): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": - # ping the link with a simple http request to check if the file exists at that URI - if spec.seqspec_version == "0.3.0": - if not file_exists(ol.url): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - else: - if not file_exists(ol.filename): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 + # Only keep errors that don't match our filter criteria + if not should_filter: + ferrors.append(error) + return ferrors + else: + return errors - # read ids should be unique - read_ids = set() - for read in spec.sequence_spec: - if read.read_id in read_ids: - errors.append( - f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" + +def check(spec: Assay, spec_fn: str, skip: str = None): + # Variety of checks against schema + def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): + if skip == "igvf": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf.schema.json" + ) + elif skip == "igvf_onlist_skip": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf_onlist_skip.schema.json" ) - idx += 1 else: - read_ids.add(read.read_id) - - # iterate through reads in sequence_spec and check that the fastq files exist - for read in spec.sequence_spec: - spec_fn = path.dirname(spec_fn) - for f in read.files: - if f.urltype == "local": - check = path.join(spec_fn, f.filename) - if not path.exists(check): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 - elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": - # ping the link with a simple http request to check if the file exists at that URI - if not file_exists(f.url): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + with open(schema_fn, "r") as stream: + schema = yaml.load(stream, Loader=yaml.Loader) + validator = Draft4Validator(schema) + for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): + err_elements = [repr(index) for index in error.path] + err_path = f"spec[{']['.join(err_elements)}]" + errobj = { + "error_type": "check_schema", + "error_message": f"{error.message} in {err_path}", + "error_object": err_elements[-1], + } + # errors.append(f"[error {idx}] {error.message} in {err_path}]") + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the primer ids, strand tuple pairs are unique across all reads - primer_strand_pairs = set() - for read in spec.sequence_spec: - if (read.primer_id, read.strand) in primer_strand_pairs: - errors.append( - f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" - ) + # Modalities are unique + def check_unique_modalities(spec, spec_fn, errors, idx): + if len(spec.modalities) != len(set(spec.modalities)): + errobj = { + "error_type": "check_unique_modalities", + "error_message": f"modalities [{', '.join(spec.modalities)}] are not unique", + "error_object": "modalities", + } + # errors.append( + # f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" + # ) + errors.append(errobj) idx += 1 - else: - primer_strand_pairs.add((read.primer_id, read.strand)) + return (errors, idx) - # TODO add option to check md5sum + # Region_ids of the first level correspond to the modalities (one per modality) + def check_region_ids_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + rgns = spec.library_spec + for r in rgns: + rid = r.region_id + if rid not in modes: + errobj = { + "error_type": "check_region_ids_modalities", + "error_message": f"region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the region_id is unique across all regions - rgn_ids = set() - for m in modes: - for rgn in spec.get_libspec(m).get_leaves(): - if rgn.region_id in rgn_ids: - errors.append( - f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" - ) + # Onlist files exist relative to the path of the spec or http + def check_onlist_files_exist(spec, spec_fn, errors, idx): + modes = spec.modalities + olrgns = [] + for m in modes: + olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] + + # check paths relative to spec_fn + for ol in olrgns: + if ol.urltype == "local": + if ol.filename[:-3] == ".gz": + check = path.join(path.dirname(spec_fn), ol.filename[:-3]) + if not path.exists(check): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename[:-3]} does not exist", + "error_object": "onlist", + } + # errors.append( + # f"[error {idx}] {ol.filename[:-3]} does not exist" + # ) + errors.append(errobj) + idx += 1 + else: + check = path.join(path.dirname(spec_fn), ol.filename) + check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") + if not path.exists(check) and not path.exists(check_gz): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": + # ping the link with a simple http request to check if the file exists at that URI + if spec.seqspec_version == get_version(): + if not file_exists(ol.url): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + else: + if not file_exists(ol.filename): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) + + # Read ids are unique + def check_unique_read_ids(spec, spec_fn, errors, idx): + read_ids = set() + for read in spec.sequence_spec: + if read.read_id in read_ids: + errobj = { + "error_type": "check_unique_read_ids", + "error_message": f"read_id '{read.read_id}' is not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" + # ) + errors.append(errobj) idx += 1 else: - rgn_ids.add(rgn.region_id) + read_ids.add(read.read_id) + return (errors, idx) - # check that the modality is in the reads - for read in spec.sequence_spec: - if read.modality not in modes: - errors.append( - f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" - ) - idx += 1 + # Read files exist + def check_read_files_exist(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + spec_fn = path.dirname(spec_fn) + for f in read.files: + if f.urltype == "local": + check = path.join(spec_fn, f.filename) + if not path.exists(check): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": + # ping the link with a simple http request to check if the file exists at that URI + if not file_exists(f.url): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) + + # Primer ids, strand tuple pairs are unique across all reads + def check_unique_read_primer_strand_pairs(spec, spec_fn, errors, idx): + primer_strand_pairs = set() + for read in spec.sequence_spec: + if (read.primer_id, read.strand) in primer_strand_pairs: + errobj = { + "error_type": "check_unique_read_primer_strand_pairs", + "error_message": f"primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" + # ) + errors.append(errobj) + idx += 1 + else: + primer_strand_pairs.add((read.primer_id, read.strand)) + return (errors, idx) + + # TODO add option to check md5sum + def check_md5sum(spec, spec_fn, errors, idx): + return (errors, idx) + + # Region_id is unique across all regions + def check_unique_region_ids(spec, spec_fn, errors, idx): + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + errobj = { + "error_type": "check_unique_region_ids", + "error_message": f"region_id '{rgn.region_id}' is not unique across all regions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" + # ) + errors.append(errobj) + idx += 1 + else: + rgn_ids.add(rgn.region_id) + return (errors, idx) + + # Modality is in the reads + def check_read_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + for read in spec.sequence_spec: + if read.modality not in modes: + errobj = { + "error_type": "check_read_modalities", + "error_message": f"read '{read.read_id}' modality '{read.modality}' does not exist in the modalities", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the unique primer ids exist as a region id in the library_spec - for read in spec.sequence_spec: - if read.primer_id not in rgn_ids: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" - ) - idx += 1 + # TODO is there a better way to get the rgn_ids? + def check_primer_ids_in_region_ids(spec, spec_fn, errors, idx): + # first get all unique region_ids + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + pass + else: + rgn_ids.add(rgn.region_id) + + # then check that the primer ids exist in the region_ids + for read in spec.sequence_spec: + if read.primer_id not in rgn_ids: + errobj = { + "error_type": "check_primer_ids_in_region_ids", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # NOTE: this is a strong assumption that may be relaxed in the future # check that the primer id for each read is in the leaves of the spec for that modality - for read in spec.sequence_spec: - mode = spec.get_libspec(read.modality) - leaves = mode.get_leaves() - if read.primer_id not in [i.region_id for i in leaves]: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" - ) - idx += 1 + def check_primer_ids_in_libspec_leaves(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + mode = spec.get_libspec(read.modality) + leaves = mode.get_leaves() + if read.primer_id not in [i.region_id for i in leaves]: + errobj = { + "error_type": "check_primer_ids_in_libspec_leaves", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the max read len is not longer than the max len of the lib spec after the primer # for read in spec.sequence_spec: @@ -202,89 +416,172 @@ def check(schema: Draft4Validator, spec: Assay, spec_fn: str): # leaves = mode.get_leaves() # idx = [i.region_id for i in leaves].index(read.primer_id) - # if a region has a sequence type "fixed" then it should not contain subregions - # if a region has a sequence type "joiend" then it should contain subregions - # if a region has a sequence type "random" then it should not contain subregions and should be all X's - # if a region has a sequence type "onlist" then it should have an onlist object - def seqtype_check(rgn, errors, idx): - # this is a recursive function that iterates through all regions and checks the sequence type - if rgn.sequence_type == "fixed" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "joined" and not rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" - ) - idx += 1 - if rgn.sequence_type == "onlist" and not rgn.onlist: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" - ) - idx += 1 - if rgn.regions: - for r in rgn.regions: - errors, idx = seqtype_check(r, errors, idx) - return (errors, idx) + def check_sequence_types(spec, spec_fn, errors, idx): + modes = spec.modalities + + # if a region has a sequence type "fixed" then it should not contain subregions + # if a region has a sequence type "joiend" then it should contain subregions + # if a region has a sequence type "random" then it should not contain subregions and should be all X's + # if a region has a sequence type "onlist" then it should have an onlist object + def seqtype_check(rgn, errors, idx): + # this is a recursive function that iterates through all regions and checks the sequence type + if rgn.sequence_type == "fixed" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'fixed' and contains subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "joined" and not rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'joined' and does not contain subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions", + "error_object": "region", + } + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and ( + set(rgn.sequence) != {"X"} + or not (rgn.min_len <= len(rgn.sequence) <= rgn.max_len) + ): + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's", + "error_object": "region", + } + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "onlist" and not rgn.onlist: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object", + "error_object": "region", + } + errors.append(errobj) + idx += 1 + if rgn.regions: + for r in rgn.regions: + errors, idx = seqtype_check(r, errors, idx) + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seqtype_check(rgn, errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seqtype_check(rgn, errors, idx) + return (errors, idx) # check the lengths of every region against the max_len, using a recursive function - def len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = len_check(r, errors, idx) - if rgn.max_len < rgn.min_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" - ) - idx += 1 + def check_region_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def len_check(rgn, errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = len_check(r, errors, idx) + if rgn.max_len < rgn.min_len: + errobj = { + "error_type": "check_region_lengths", + "error_message": f"'{rgn.region_id}' max_len is less than min_len", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = len_check(rgn, errors, idx) return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = len_check(rgn, errors, idx) + # errors, idx = check_region_lengths(spec, spec_fn, errors, idx) # check that the length of the sequence is equal to the max_len using a recursive function # an assumption in the code and spec is that the displayed sequence is equal to the max_len - def seq_len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = seq_len_check(r, errors, idx) - if rgn.sequence and ( - len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len - ): - # noqa - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" - ) - idx += 1 - return (errors, idx) + def check_sequence_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def seq_len_check(rgn, errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = seq_len_check(r, errors, idx) + if rgn.sequence and ( + len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len + ): + # noqa + errobj = { + "error_type": "check_sequence_lengths", + "error_message": f"'{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seq_len_check(rgn, errors, idx) + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seq_len_check(rgn, errors, idx) + return (errors, idx) # check that the number of files in each "File" object for all Read object are all the same length - nfiles = [] - for read in spec.sequence_spec: - nfiles.append(len(read.files)) + def check_read_file_count(spec, spec_fn, errors, idx): + nfiles = [] + for read in spec.sequence_spec: + nfiles.append(len(read.files)) + + if len(set(nfiles)) != 1: + errobj = { + "error_type": "check_read_file_count", + "error_message": "Reads must have the same number of files", + "error_object": "read", + } + # errors.append(f"[error {idx}] Reads must have the same number of files") + errors.append(errobj) + idx += 1 + return (errors, idx) - if len(set(nfiles)) != 1: - errors.append(f"[error {idx}] Reads must have the same number of files") - idx += 1 + # errors, idx = check_read_file_count(spec, spec_fn, errors, idx) + + errors = [] + idx = 0 + checks = { + "check_schema": check_schema, + "check_unique_modalities": check_unique_modalities, + "check_region_ids_modalities": check_region_ids_modalities, + "check_onlist_files_exist": check_onlist_files_exist, + "check_unique_read_ids": check_unique_read_ids, + "check_read_files_exist": check_read_files_exist, + "check_unique_read_primer_strand_pairs": check_unique_read_primer_strand_pairs, + "check_unique_region_ids": check_unique_region_ids, + "check_read_modalities": check_read_modalities, + "check_primer_ids_in_region_ids": check_primer_ids_in_region_ids, + "check_primer_ids_in_libspec_leaves": check_primer_ids_in_libspec_leaves, + "check_sequence_types": check_sequence_types, + "check_region_lengths": check_region_lengths, + "check_sequence_lengths": check_sequence_lengths, + "check_read_file_count": check_read_file_count, + } + for k, v in checks.items(): + errors, idx = v(spec, spec_fn, errors, idx) return errors diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py new file mode 100644 index 00000000..11a4d1a1 --- /dev/null +++ b/seqspec/seqspec_convert.py @@ -0,0 +1,434 @@ +"""Convert module for seqspec CLI. + +This module provides functionality to convert between different formats (seqspec, genbank, token). +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +import json +import numpy as np +from typing import Dict, List, Tuple +import os + +from seqspec.utils import load_genbank, load_spec +from seqspec.Region import Region +from seqspec.Assay import Assay + + +# Load schema and constants +schema_fn = os.path.join(os.path.dirname(__file__), "schema/seqspec.schema.json") +with open(schema_fn, "r") as f: + schema = json.load(f) +REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"] +MODALITIES = schema["properties"]["modalities"]["items"]["enum"] +SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"] + + +def setup_convert_args(parser) -> ArgumentParser: + """Create and configure the convert command subparser.""" + subparser = parser.add_parser( + "convert", + description=""" +Convert between different formats (seqspec, genbank, token). + +Examples: +seqspec convert -ifmt seqspec -ofmt token spec.yaml -o output_dir # Convert seqspec to token format +seqspec convert -ifmt genbank -ofmt seqspec input.gb -o spec.yaml # Convert genbank to seqspec +--- +""", + help="Convert between different formats", + formatter_class=RawTextHelpFormatter, + ) + choices = ["genbank", "seqspec", "token"] + subparser.add_argument( + "-ifmt", + "--input-format", + help="Input format", + type=str, + default="seqspec", + choices=choices, + ) + subparser.add_argument( + "-ofmt", + "--output-format", + help="Output format", + type=str, + default="token", + choices=choices, + ) + subparser.add_argument( + "-o", + "--output", + metavar="OUT", + help="Path to output file or directory", + type=Path, + default=None, + required=False, + ) + subparser.add_argument( + "input_file", + metavar="IN", + help="Path to input file", + type=Path, + ) + return subparser + + +def validate_convert_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the convert command arguments.""" + if not Path(args.input_file).exists(): + parser.error(f"Input file does not exist: {args.input_file}") + + if args.output and Path(args.output).exists(): + if args.output_format == "token": + if not Path(args.output).is_dir(): + parser.error( + f"Output path exists but is not a directory: {args.output}" + ) + else: + if not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_convert(parser: ArgumentParser, args: Namespace) -> None: + """Run the convert command.""" + validate_convert_args(parser, args) + + CONVERT = { + ("genbank", "seqspec"): gb_to_seqspec, + ("seqspec", "token"): seqspec_to_token, + } + + file = load_input_file(args.input_file, args.input_format) + result = CONVERT[(args.input_format, args.output_format)](file) + + if args.output: + if args.output_format == "token": + save_tokenized_spec(*result, str(args.output)) # noqa + else: + # Handle other output formats here + pass + else: + return result + + +def load_input_file(fn: Path, ifmt: str): + """Load input file based on format.""" + LOAD = { + "genbank": load_genbank, + "seqspec": load_spec, + # "token": load_token, + } + return LOAD[ifmt](fn) + + +def get_feature_names() -> List[str]: + """Generate ordered list of column names""" + features = [] + + # Modality one-hot features + features.extend([f"modality_{mod}" for mod in MODALITIES]) + + # Region type one-hot features + features.extend([f"region_{rt}" for rt in REGION_TYPES]) + + # Sequence type one-hot features + features.extend([f"seq_{st}" for st in SEQUENCE_TYPES]) + + # Numerical features + features.extend(["min_len", "max_len", "position"]) + + return features + + +def save_tokenized_spec( + matrix: np.ndarray, row_identifiers: List[Tuple[str, str, str]], output_path: str +): + """ + Save tokenized spec output to three files: + - spec.npy: The matrix data + - rows.txt: Tab-separated list of (spec_id, modality, region_type) + - cols.txt: List of column names + + Args: + matrix: The tokenized matrix from tokenize_specs + row_identifiers: List of (spec_id, modality, region_type) tuples + output_path: Path to save the output (directory) + """ + # Create output directory if needed + output_dir = Path(output_path) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save matrix + np.save(output_dir / "spec.npy", matrix) + + # Save row identifiers (tab-separated) + with open(output_dir / "rows.txt", "w") as f: + for spec_id, modality, region_type in row_identifiers: + f.write(f"{spec_id}\t{modality}\t{region_type}\n") + + # Save column names + feature_names = get_feature_names() + with open(output_dir / "cols.txt", "w") as f: + for feature in feature_names: + f.write(f"{feature}\n") + + +def seqspec_to_token(spec): + # for each modalitiy, make a dictionary of regions + specs_regions = {} + modalities = spec.list_modalities() + for modality in modalities: + regions = [i.to_dict() for i in spec.get_libspec(modality).get_leaves()] + specs_regions[modality] = regions + + # Convert to tokenized matrix + return tokenize_specs({spec.assay_id: specs_regions}) + + +def tokenize_specs( + specs_regions: Dict[str, Dict[str, List[Dict]]], +) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]: + """ + Convert specs into a single matrix where each row represents a complete region specification + + Args: + specs_regions: Dict[spec_id -> Dict[modality -> List[region_dict]]] + + Returns: + - Matrix where each row is [modality_onehot, region_type_onehot, sequence_type_onehot, min_len, max_len, position] + - List of (spec_id, modality, region_type) identifying each row + """ + # Calculate feature dimensions + n_modality_features = len(MODALITIES) + n_region_type_features = len(REGION_TYPES) + n_sequence_type_features = len(SEQUENCE_TYPES) + + # Total features = one-hot encodings + numerical features + position + total_features = ( + n_modality_features # modality one-hot + + n_region_type_features # region_type one-hot + + n_sequence_type_features # sequence_type one-hot + + 2 # min_len, max_len + + 1 # position in region list (1-based) + ) + + # features = get_feature_names() + + rows = [] # Will hold our feature vectors + row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples + + for spec_id, modality_regions in specs_regions.items(): + for modality, regions in modality_regions.items(): + # Enumerate regions to get position (1-based) + for position, region in enumerate(regions, start=1): + # Create feature vector for this region + feature_vector = np.zeros(total_features).astype(int) + current_idx = 0 + + # Add modality one-hot + modality_idx = MODALITIES.index(modality) + feature_vector[modality_idx] = 1 + current_idx += n_modality_features + + # Add region_type one-hot + region_type_idx = REGION_TYPES.index(region["region_type"]) + feature_vector[current_idx + region_type_idx] = 1 + current_idx += n_region_type_features + + # Add sequence_type one-hot + sequence_type_idx = SEQUENCE_TYPES.index(region["sequence_type"]) + feature_vector[current_idx + sequence_type_idx] = 1 + current_idx += n_sequence_type_features + + # Add lengths + feature_vector[current_idx] = region["min_len"] + feature_vector[current_idx + 1] = region["max_len"] + current_idx += 2 + + # Add position + feature_vector[current_idx] = position + + # Store feature vector and identifier + rows.append(feature_vector) + row_identifiers.append((spec_id, modality, region["region_type"])) + + return np.array(rows), row_identifiers + + +def gb_to_seqspec(gb): + ex = gb_to_list(gb) + nested_json = nest_intervals(ex) + filled_regions = fill_gaps(gb.sequence, nested_json) + regions = convert(filled_regions) + reads = [] + spec = Assay( + "genbank", + "illumina", + "genbank thing", + "doi", + "date", + ["source"], + "description", + "", + "", + "", + "", + reads, + regions, + ) + return spec + + +def gb_to_list(gb): + feat = [] + label = "source" + for f in gb.features: + id = f.key + + if "complement" in f.location: + start, stop = tuple(map(int, f.location[11:-1].split(".."))) + else: + start, stop = tuple(map(int, f.location.split(".."))) + + # convert to 0-index + start -= 1 + length = stop - start + seq = gb.sequence[start:stop] + + for q in f.qualifiers: + if q.key == "/label=": + label = q.value + break + feat.append( + { + "id": id, + "label": label, + "start": start, + "stop": stop, + "length": length, + "seq": seq, + } + ) + return feat + + +def nest_intervals(intervals): + def nest(start_index, end_limit): + nested = [] + + i = start_index + while i < len(intervals) and intervals[i]["start"] < end_limit: + current_interval = intervals[i] + child, next_index = nest(i + 1, current_interval["stop"]) + interval_obj = { + "id": current_interval["id"], + "label": current_interval["label"], + "start": current_interval["start"], + "stop": current_interval["stop"], + "length": current_interval["length"], + "seq": current_interval["seq"], + "regions": child, + } + nested.append(interval_obj) + i = next_index + + return nested, i + + result, _ = nest(0, intervals[0]["stop"]) + return result + + +def fill_gaps(seq, regions, parent_start=0, parent_stop=0): + if len(regions) == 0: + return [] + + # Insert a filler at the start if necessary + if regions[0]["start"] > parent_start: + start = parent_start + stop = regions[0]["start"] + s = seq[start:stop] + regions.insert( + 0, + { + "id": "filler_start", + "label": "filler_start", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + }, + ) + + new_regions = [] + for i, region in enumerate(regions): + # Append the current region + new_regions.append(region) + + # Recursive call for nested regions + if "regions" in region: + region["regions"] = fill_gaps( + seq, region["regions"], region["start"], region["stop"] + ) + + # Check for gap and insert a filler + if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: + filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' + start = region["stop"] + stop = regions[i + 1]["start"] + s = seq[start:stop] + new_regions.append( + { + "id": filler_id, + "label": filler_id, + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + # Insert a filler at the end if necessary + if new_regions[-1]["stop"] < parent_stop: + start = new_regions[-1]["stop"] + stop = parent_stop + s = seq[start:stop] + new_regions.append( + { + "id": "filler_end", + "label": "filler_end", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + return new_regions + + +# convert filled regions to seqspec, must be recursive function +# regions is a list +def convert(regions): + if len(regions) == 0: + return [] + new_regions = [] + for r in regions: + rgn = Region( + r["id"], + "", + r["label"], + "fixed", + r["seq"], + r["length"], + r["length"], + None, + None, + ) + if len(r["regions"]) > 0: + rgn.regions = convert(r["regions"]) + new_regions.append(rgn) + return new_regions diff --git a/seqspec/seqspec_diff.py b/seqspec/seqspec_diff.py index c4e6a57a..b4ba3a88 100644 --- a/seqspec/seqspec_diff.py +++ b/seqspec/seqspec_diff.py @@ -1,45 +1,156 @@ -from .Region import Region -from seqspec.Assay import Assay +"""Diff module for seqspec. + +This module provides functionality to compare two seqspec files and identify differences. +""" + +from pathlib import Path +from argparse import ArgumentParser, Namespace +from typing import List + from seqspec.utils import load_spec +from seqspec.Assay import Assay +from seqspec.Region import Region -def setup_diff_args(parser): - parser_diff = parser.add_parser( +def setup_diff_args(parser) -> ArgumentParser: + """Create and configure the diff command subparser.""" + subparser = parser.add_parser( "diff", - description="diff two seqspecs", - help="diff two seqspecs", + description=""" +Compare two seqspec files and identify differences. + +Examples: +seqspec diff spec1.yaml spec2.yaml # Compare two specs and print differences +seqspec diff spec1.yaml spec2.yaml -o diff.txt # Compare specs and save differences to file +--- +""", + help="Compare two seqspec files and identify differences", + ) + subparser.add_argument( + "yamlA", help="First sequencing specification yaml file", type=str ) - parser_diff.add_argument("yamlA", help="Sequencing specification yaml file A") - parser_diff.add_argument("yamlB", help="Sequencing specification yaml file B") - parser_diff.add_argument( + subparser.add_argument( + "yamlB", help="Second sequencing specification yaml file", type=str + ) + subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) - return parser_diff + return subparser + + +def validate_diff_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the diff command arguments.""" + if not Path(args.yamlA).exists(): + parser.error(f"Input file A does not exist: {args.yamlA}") + if not Path(args.yamlB).exists(): + parser.error(f"Input file B does not exist: {args.yamlB}") + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_diff(parser: ArgumentParser, args: Namespace) -> None: + """Run the diff command.""" + validate_diff_args(parser, args) + + spec_a = load_spec(args.yamlA) + spec_b = load_spec(args.yamlB) + + differences = compare_specs(spec_a, spec_b) + + if args.output: + args.output.write_text(differences) + else: + print(differences) + + +def compare_specs(spec_a: Assay, spec_b: Assay) -> str: + """Compare two specs and return a string describing their differences.""" + differences = [] + + # Compare modalities + modalities_a = set(spec_a.modalities) + modalities_b = set(spec_b.modalities) + + if modalities_a != modalities_b: + differences.append("Modalities differ:") + differences.append(f" Spec A: {', '.join(sorted(modalities_a))}") + differences.append(f" Spec B: {', '.join(sorted(modalities_b))}") + + # Compare common modalities + common_modalities = modalities_a.intersection(modalities_b) + for modality in common_modalities: + regions_a = spec_a.get_libspec(modality).get_leaves() + regions_b = spec_b.get_libspec(modality).get_leaves() + + region_diffs = compare_regions(regions_a, regions_b) + if region_diffs: + differences.append(f"\nModality '{modality}' differences:") + differences.extend(region_diffs) + + return "\n".join(differences) if differences else "No differences found" + + +def compare_regions(regions_a: List[Region], regions_b: List[Region]) -> List[str]: + """Compare two lists of regions and return a list of differences.""" + differences = [] + + # Create lookup dictionaries + regions_a_dict = {r.region_id: r for r in regions_a} + regions_b_dict = {r.region_id: r for r in regions_b} + + # Find regions unique to each spec + unique_to_a = set(regions_a_dict.keys()) - set(regions_b_dict.keys()) + unique_to_b = set(regions_b_dict.keys()) - set(regions_a_dict.keys()) + + if unique_to_a: + differences.append(" Regions only in spec A:") + for region_id in sorted(unique_to_a): + differences.append(f" - {region_id}") + + if unique_to_b: + differences.append(" Regions only in spec B:") + for region_id in sorted(unique_to_b): + differences.append(f" - {region_id}") + # Compare common regions + common_regions = set(regions_a_dict.keys()).intersection(set(regions_b_dict.keys())) + for region_id in sorted(common_regions): + region_a = regions_a_dict[region_id] + region_b = regions_b_dict[region_id] -def validate_diff_args(parser, args): - # if everything is valid the run_diff - A_fn = args.yamlA - B_fn = args.yamlB - # o = args.o - A = load_spec(A_fn) - B = load_spec(B_fn) + region_diffs = diff_regions(region_a, region_b) + if region_diffs: + differences.append(f" Region '{region_id}' differences:") + differences.extend(f" - {diff}" for diff in region_diffs) - # load in two specs - run_diff(A, B) + return differences -def run_diff(A: Assay, B: Assay): - # What does it mean to diff two assays? - # Only compare on modalities? - # itx: pull out regions that have the same name? - # itx: - pass +def diff_regions(region_a: Region, region_b: Region) -> List[str]: + """Compare two regions and return a list of differences.""" + differences = [] + # Compare basic properties + if region_a.region_type != region_b.region_type: + differences.append( + f"region_type: {region_a.region_type} != {region_b.region_type}" + ) + if region_a.name != region_b.name: + differences.append(f"name: {region_a.name} != {region_b.name}") + if region_a.sequence_type != region_b.sequence_type: + differences.append( + f"sequence_type: {region_a.sequence_type} != {region_b.sequence_type}" + ) + if region_a.sequence != region_b.sequence: + differences.append(f"sequence: {region_a.sequence} != {region_b.sequence}") + if region_a.min_len != region_b.min_len: + differences.append(f"min_len: {region_a.min_len} != {region_b.min_len}") + if region_a.max_len != region_b.max_len: + differences.append(f"max_len: {region_a.max_len} != {region_b.max_len}") -def diff_regions(R1: Region, R2: Region): - pass + return differences diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 31be20dd..d1ad8e54 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -1,16 +1,22 @@ +"""File module for seqspec. + +This module provides functionality to list and format files present in seqspec files. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, List, Optional +from collections import defaultdict +import json + from seqspec.utils import load_spec from seqspec.Assay import Assay -from collections import defaultdict from seqspec.File import File -from typing import Dict, List, Optional -from argparse import RawTextHelpFormatter -import json from seqspec import seqspec_find -import os -import argparse -def setup_file_args(parser): +def setup_file_args(parser) -> ArgumentParser: + """Create and configure the file command subparser.""" subparser = parser.add_parser( "file", description=""" @@ -29,36 +35,37 @@ def setup_file_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "-i", + "--ids", metavar="IDs", - help=("Ids to list"), + help="Ids to list", type=str, default=None, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) - # the object we are using to index - # choices = ["read", "region", "file", "onlist", "region-type"] choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -66,6 +73,7 @@ def setup_file_args(parser): choices = ["paired", "interleaved", "index", "list", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", help=f"Format, [{', '.join(choices)}], default: paired", type=str, @@ -84,6 +92,7 @@ def setup_file_args(parser): ] subparser.add_argument( "-k", + "--key", metavar="KEY", help=f"Key, [{', '.join(choices)}], default: file_id", type=str, @@ -94,7 +103,7 @@ def setup_file_args(parser): # option to get the full path of the file subparser.add_argument( "--fullpath", - help=argparse.SUPPRESS, + help="Use full path for local files", action="store_true", default=False, ) @@ -102,59 +111,50 @@ def setup_file_args(parser): return subparser -def validate_file_args(parser, args): - spec_fn = os.path.abspath(args.yaml) - o = args.o - m = args.m # modality - idtype = args.s # selector - fmt = args.f # format - ids = args.i # ids - k = args.k # key - fp = args.fullpath +def validate_file_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the file command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - if (k == "filesize" or k == "filetype" or k == "urltype" or k == "md5") and ( - fmt == "paired" or fmt == "interleaved" or fmt == "index" - ): - parser.error(f"-f {fmt} valid only with -k file_id, filename, url") + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - return run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=fp) + if args.key in ["filesize", "filetype", "urltype", "md5"] and args.format in [ + "paired", + "interleaved", + "index", + ]: + parser.error( + f"Format '{args.format}' valid only with key 'file_id', 'filename', or 'url'" + ) -def run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=False): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - files = file(spec, m, ids, idtype, fmt, k, spec_fn, fp) - - if files: - if o: - with open(o, "w") as f: - print(files, file=f) - else: - print(files) - return - - -def file( +def seqspec_file( spec: Assay, modality: str, - ids: List[str], - idtype: str, - fmt: str, - k: Optional[str], - spec_fn: str, - fp: bool = False, -): + ids: Optional[List[str]] = None, + selector: str = "read", +) -> Dict[str, List[File]]: + """Core functionality to list files from a seqspec. + + Args: + spec: The Assay object to operate on + spec_fn: Path to the spec file, used for relative path resolution + modality: The modality to list files for + ids: Optional list of IDs to filter by + selector: Type of ID to filter by (read, region, file, region-type) + + Returns: + Dictionary mapping IDs to lists of File objects + """ # NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST # NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read. LIST_FILES = { "read": list_read_files, "region": list_region_files, - "file": list_files, - # "onlist": list_onlist_files, + "file": list_all_files, } + LIST_FILES_BY_ID = { "read": list_files_by_read_id, "file": list_files_by_file_id, @@ -162,34 +162,60 @@ def file( "region-type": list_files_by_region_type, } - if len(ids) == 0: - # list all the files - files = LIST_FILES[idtype](spec, modality) + # Get files based on whether we're filtering by IDs + if not ids: + # list all files + return LIST_FILES[selector](spec, modality) else: - # list only the id files - files = LIST_FILES_BY_ID[idtype](spec, modality, ids) - - FORMAT = { - "list": format_list_files_metadata, - "paired": format_list_files, - "interleaved": format_list_files, - "index": format_list_files, - "json": format_json_files, - } + # list files by id + return LIST_FILES_BY_ID[selector](spec, modality, ids) + + +def run_file(parser: ArgumentParser, args: Namespace) -> None: + """Run the file command.""" + validate_file_args(parser, args) - x = FORMAT[fmt](files, fmt, k, spec_fn, fp) - return x + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + files = seqspec_file( + spec=spec, + modality=args.modality, + ids=ids, + selector=args.selector, + ) + + if files: + FORMAT = { + "list": format_list_files_metadata, + "paired": format_list_files, + "interleaved": format_list_files, + "index": format_list_files, + "json": format_json_files, + } + + result = FORMAT[args.format]( + files, args.format, args.key, Path(args.yaml), args.fullpath + ) + + if args.output: + args.output.write_text(str(result)) + else: + print(result) -def list_read_files(spec, modality): +def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List files for all reads in a modality.""" files = defaultdict(list) reads = spec.get_seqspec(modality) for rd in reads: - files[rd.read_id] = rd.files + if rd.files: + files[rd.read_id] = rd.files return files -def list_files(spec, modality): +def list_all_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List all files in a modality.""" files_rd = list_read_files(spec, modality) files_rgn = list_region_files(spec, modality) return {**files_rd, **files_rgn} @@ -210,131 +236,155 @@ def list_region_files(spec, modality): def format_list_files_metadata( - files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False -): - x = "" + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format file metadata as a tab-separated list.""" + x = [] if k == "all": for items in zip(*files.values()): for key, item in zip(files.keys(), items): - x += f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}\n" - x = x[:-1] - + x.append( + f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}" + ) else: for items in zip(*files.values()): for key, item in zip(files.keys(), items): attr = str(getattr(item, k)) id = item.file_id - x += f"{key}\t{id}\t{attr}\n" - x = x[:-1] + x.append(f"{key}\t{id}\t{attr}") + return "\n".join(x) - return x - -def format_json_files(files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False): +def format_json_files( + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as JSON.""" x = [] for items in zip(*files.values()): if k == "all": for key, item in zip(files.keys(), items): d = item.to_dict() if item.urltype == "local" and fp: - d["url"] = os.path.join(os.path.dirname(spec_fn), d["url"]) + d["url"] = str(spec_fn.parent / d["url"]) x.append(d) else: for key, item in zip(files.keys(), items): attr = getattr(item, k) if k == "url" and item.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) + attr = str(spec_fn.parent / attr) x.append({"file_id": item.file_id, k: attr}) return json.dumps(x, indent=4) -def format_list_files(files: Dict[str, List[File]], fmt, k=None, spec_fn="", fp=False): - x = "" +def format_list_files( + files: Dict[str, List[File]], + fmt: str, + k: Optional[str] = None, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as a list based on the format type.""" + x = [] + if fmt == "paired": - x = "" for items in zip(*files.values()): - t = "" + t = [] for i in items: if k: attr = str(getattr(i, k)) if k == "url" and i.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) - t += f"{attr}\t" + attr = str(spec_fn.parent / attr) + t.append(attr) else: - t += f"{i.filename}\t" - x += f"{t[:-1]}\n" - x = x[:-1] + t.append(i.filename) + x.append("\t".join(t)) - elif fmt == "interleaved" or fmt == "list": + elif fmt in ["interleaved", "list"]: for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "\n" - x = x[:-1] + id = str(spec_fn.parent / id) + x.append(id) + elif fmt == "index": + t = [] for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "," - x = x[:-1] + id = str(spec_fn.parent / id) + t.append(id) + x.append(",".join(t)) - return x + return "\n".join(x) -def list_files_by_read_id(spec, modality, read_ids): +def list_files_by_read_id( + spec: Assay, modality: str, read_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific read IDs.""" seqspec = spec.get_seqspec(modality) files = defaultdict(list) ids = set(read_ids) # TODO return the files in the order of the ids given in the input # NOTE ORDERING HERE IS IMPORANT SEE GET_INDEX_BY_FILES FUNCTION for read in seqspec: - if read.read_id in ids: - for file in read.files: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.read_id in ids and read.files: + files[read.read_id].extend(read.files) return files -def list_files_by_file_id(spec, modality, file_ids): +def list_files_by_file_id( + spec: Assay, modality: str, file_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific file IDs.""" seqspec = spec.get_seqspec(modality) ids = set(file_ids) files = defaultdict(list) # TODO: NOTE ORDERING HERE IS IMPORTANT SEE RUN_LIST_FILES FUNCTION for read in seqspec: - for file in read.files: - if file.filename in ids: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.files: + for file in read.files: + if file.filename in ids: + files[read.read_id].append(file) return files -def list_files_by_region_id(spec, modality, file_ids): +def list_files_by_region_id( + spec: Assay, modality: str, region_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific region IDs.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_ids) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): if region_id in ids: - new_files[region_id] += files + new_files[region_id].extend(region_files) return new_files -def list_files_by_region_type(spec, modality, file_ids): +def list_files_by_region_type( + spec: Assay, modality: str, region_types: List[str] +) -> Dict[str, List[File]]: + """List files for specific region types.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_types) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): r = seqspec_find.find_by_region_id(spec, modality, region_id)[0] - rt = r.region_type - if rt in ids: - new_files[region_id] += files + if r.region_type in ids: + new_files[region_id].extend(region_files) return new_files diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py index f5aec054..d1cd9f5d 100644 --- a/seqspec/seqspec_find.py +++ b/seqspec/seqspec_find.py @@ -1,13 +1,24 @@ +"""Find module for seqspec CLI. + +This module provides functionality to search for objects within seqspec files. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +import yaml +from typing import List, Optional, Union + from seqspec.utils import load_spec from seqspec.Assay import Assay -import yaml -import argparse -import warnings -from argparse import RawTextHelpFormatter -from seqspec.seqspec_file import list_files +from seqspec.Read import Read +from seqspec.Region import Region +from seqspec.File import File +from seqspec.seqspec_file import list_all_files -def setup_find_args(parser): +def setup_find_args(parser) -> ArgumentParser: + """Create and configure the find command subparser.""" subparser = parser.add_parser( "find", description=""" @@ -24,45 +35,48 @@ def setup_find_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) # depracate - subparser.add_argument("--rtype", help=argparse.SUPPRESS, action="store_true") + subparser.add_argument("--rtype", help=SUPPRESS, action="store_true") choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", - metavar="Selector", - help=(f"Selector, [{','.join(choices)}] (default: region)"), + "--selector", + metavar="SELECTOR", + help=f"Selector, [{','.join(choices)}] (default: region)", type=str, default="region", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) # depracate -r subparser_required.add_argument( "-r", metavar="REGION", - help=argparse.SUPPRESS, + help=SUPPRESS, type=str, default=None, ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, @@ -71,8 +85,14 @@ def setup_find_args(parser): return subparser -def validate_find_args(parser, args): - # IDs +def validate_find_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the find command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -80,46 +100,73 @@ def validate_find_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - o = args.o - idtype = args.s # selector - ids = args.i - - # run function - return run_find(fn, m, ids, idtype, o) - - -def run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str): - spec = load_spec(spec_fn) - found = [] - if idtype == "region-type": - found = find_by_region_type(spec, modality, id) - elif idtype == "region": - found = find_by_region_id(spec, modality, id) - elif idtype == "read": - found = find_by_read_id(spec, modality, id) - elif idtype == "file": - found = find_by_file_id(spec, modality, id) - else: - raise ValueError(f"Unknown idtype: {idtype}") + if not args.id: + args.id = args.r + + +def seqspec_find( + spec: Assay, selector: str, modality: str, id: Optional[str] = None +) -> Union[List[Read], List[Region], List[File]]: + """Core functionality to find objects in a seqspec file. + + Args: + spec: The Assay object to search in + selector: Type of object to search for (read, region, file, region-type) + modality: The modality to search in + id: The ID to search for (optional) + + Returns: + List of found objects matching the search criteria: + - List[Read] for "read" selector + - List[Region] for "region" and "region-type" selectors + - List[File] for "file" selector + - Empty list for unknown selectors + """ + FIND = { + "region-type": find_by_region_type, + "region": find_by_region_id, + "read": find_by_read_id, + "file": find_by_file_id, + } + + if selector not in FIND: + warnings.warn( + f"Unknown selector '{selector}'. Valid selectors are: {', '.join(FIND.keys())}" + ) + return [] + + return FIND[selector](spec, modality, id) + - # post processing - if o: - with open(o, "w") as f: +def run_find(parser: ArgumentParser, args: Namespace) -> None: + """Run the find command.""" + validate_find_args(parser, args) + + spec = load_spec(args.yaml) + found = seqspec_find(spec, args.selector, args.modality, args.id) + + # Handle output + if args.output: + with open(args.output, "w") as f: yaml.dump(found, f, sort_keys=False) else: print(yaml.dump(found, sort_keys=False)) - return +def find_by_read_id(spec: Assay, modality: str, id: Optional[str]) -> List[Read]: + """Find reads by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The read ID to search for. -# TODO implement -def find_by_read_id(spec: Assay, modality: str, id: str): + Returns: + A list of Read objects matching the ID. + """ rds = [] + if id is None: + return rds reads = spec.get_seqspec(modality) for r in reads: if r.read_id == id: @@ -127,10 +174,21 @@ def find_by_read_id(spec: Assay, modality: str, id: str): return rds -# TODO implement -def find_by_file_id(spec: Assay, modality: str, id: str): +def find_by_file_id(spec: Assay, modality: str, id: Optional[str]) -> List[File]: + """Find files by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The file ID to search for. + + Returns: + A list of File objects matching the ID. + """ files = [] - lf = list_files(spec, modality) + if id is None: + return files + lf = list_all_files(spec, modality) for k, v in lf.items(): for f in v: if f.file_id == id: @@ -138,13 +196,37 @@ def find_by_file_id(spec: Assay, modality: str, id: str): return files -def find_by_region_id(spec: Assay, modality: str, id: str): +def find_by_region_id(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: + """Find regions by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region ID to search for. + + Returns: + A list of Region objects matching the ID. + """ + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_id(id) return regions -def find_by_region_type(spec: Assay, modality: str, id: str): +def find_by_region_type(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: + """Find regions by their type. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region type to search for. + + Returns: + A list of Region objects matching the type. + """ + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_region_type(id) return regions diff --git a/seqspec/seqspec_format.py b/seqspec/seqspec_format.py index 87eccada..b6d209f6 100644 --- a/seqspec/seqspec_format.py +++ b/seqspec/seqspec_format.py @@ -1,8 +1,25 @@ +"""Format module for seqspec CLI. + +This module provides functionality to automatically format and fill in missing fields +in a seqspec specification file. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay -def setup_format_args(parser): +def setup_format_args(parser) -> ArgumentParser: + """Create and configure the format command subparser. + + Args: + parser: The main argument parser to add the format subparser to. + + Returns: + The configured format subparser. + """ subparser = parser.add_parser( "format", description=""" @@ -16,34 +33,58 @@ def setup_format_args(parser): help="Autoformat seqspec file", formatter_class=RawTextHelpFormatter, ) - # subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) return subparser -def validate_format_args(parser, args): - fn = args.yaml - o = args.o +def validate_format_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the format command arguments. + + Args: + parser: The argument parser. + args: The parsed arguments. - run_format(spec_fn=fn, o=o) + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_format(spec_fn, o): - spec = load_spec(spec_fn) - format(spec) - if o: - spec.to_YAML(o) + +def run_format(parser: ArgumentParser, args: Namespace) -> None: + """Run the format command. + + Args: + parser: The argument parser. + args: The parsed arguments. + """ + validate_format_args(parser, args) + + spec = load_spec(args.yaml) + format_spec(spec) + + if args.output: + spec.to_YAML(args.output) else: print(spec.to_YAML()) -def format(spec): - return spec.update_spec() +def format_spec(spec: Assay) -> None: + """Format a seqspec specification by updating its fields. + + Args: + spec: The seqspec specification to format. + """ + spec.update_spec() diff --git a/seqspec/seqspec_genbank.py b/seqspec/seqspec_genbank.py deleted file mode 100644 index 419d67ce..00000000 --- a/seqspec/seqspec_genbank.py +++ /dev/null @@ -1,210 +0,0 @@ -from seqspec.utils import load_genbank -import json -from seqspec.Region import Region -from seqspec.Assay import Assay - - -def setup_genbank_args(parser): - subparser = parser.add_parser( - "genbank", - description="get genbank about seqspec file", - help="get genbank about seqspec file", - ) - - subparser.add_argument("gbk", help="Genbank file") - subparser.add_argument( - "-o", - metavar="OUT", - help=("Path to output file"), - type=str, - default=None, - required=False, - ) - return subparser - - -def validate_genbank_args(parser, args): - # if everything is valid the run_genbank - fn = args.gbk - o = args.o - gb = load_genbank(fn) - - spec = run_genbank(gb) - - if o: - spec.to_YAML(o) - else: - print(json.dumps(spec, sort_keys=False, indent=4)) - - -def run_genbank(gb): - ex = gb_to_list(gb) - nested_json = nest_intervals(ex) - filled_regions = fill_gaps(gb.sequence, nested_json) - regions = convert(filled_regions) - spec = Assay( - "genbank", - "illumina", - "genbank thing", - "doi", - "date", - "description", - ["source"], - "", - regions, - ) - return spec - - -def gb_to_list(gb): - feat = [] - label = "source" - for f in gb.features: - id = f.key - - if "complement" in f.location: - start, stop = tuple(map(int, f.location[11:-1].split(".."))) - else: - start, stop = tuple(map(int, f.location.split(".."))) - - # convert to 0-index - start -= 1 - length = stop - start - seq = gb.sequence[start:stop] - - for q in f.qualifiers: - if q.key == "/label=": - label = q.value - break - feat.append( - { - "id": id, - "label": label, - "start": start, - "stop": stop, - "length": length, - "seq": seq, - } - ) - return feat - - -def nest_intervals(intervals): - def nest(start_index, end_limit): - nested = [] - - i = start_index - while i < len(intervals) and intervals[i]["start"] < end_limit: - current_interval = intervals[i] - child, next_index = nest(i + 1, current_interval["stop"]) - interval_obj = { - "id": current_interval["id"], - "label": current_interval["label"], - "start": current_interval["start"], - "stop": current_interval["stop"], - "length": current_interval["length"], - "seq": current_interval["seq"], - "regions": child, - } - nested.append(interval_obj) - i = next_index - - return nested, i - - result, _ = nest(0, intervals[0]["stop"]) - return result - - -def fill_gaps(seq, regions, parent_start=0, parent_stop=0): - if len(regions) == 0: - return [] - - # Insert a filler at the start if necessary - if regions[0]["start"] > parent_start: - start = parent_start - stop = regions[0]["start"] - s = seq[start:stop] - regions.insert( - 0, - { - "id": "filler_start", - "label": "filler_start", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - }, - ) - - new_regions = [] - for i, region in enumerate(regions): - # Append the current region - new_regions.append(region) - - # Recursive call for nested regions - if "regions" in region: - region["regions"] = fill_gaps( - seq, region["regions"], region["start"], region["stop"] - ) - - # Check for gap and insert a filler - if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: - filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' - start = region["stop"] - stop = regions[i + 1]["start"] - s = seq[start:stop] - new_regions.append( - { - "id": filler_id, - "label": filler_id, - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - # Insert a filler at the end if necessary - if new_regions[-1]["stop"] < parent_stop: - start = new_regions[-1]["stop"] - stop = parent_stop - s = seq[start:stop] - new_regions.append( - { - "id": "filler_end", - "label": "filler_end", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - return new_regions - - -# convert filled regions to seqspec, must be recursive function -# regions is a list -def convert(regions): - if len(regions) == 0: - return [] - new_regions = [] - for r in regions: - rgn = Region( - r["id"], - "", - r["label"], - "fixed", - r["seq"], - r["length"], - r["length"], - None, - None, - ) - if len(r["regions"]) > 0: - rgn.regions = convert(r["regions"]) - new_regions.append(rgn) - return new_regions diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index 4bbc6576..fff7f59e 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -1,19 +1,28 @@ +"""Index module for seqspec CLI. + +This module provides functionality to identify the position of elements in a spec for use in downstream tools. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +from typing import List, Optional, Dict, Any + from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import find_by_region_id -import warnings from seqspec.seqspec_file import list_files_by_file_id, list_read_files -from argparse import SUPPRESS, RawTextHelpFormatter -from seqspec.Region import complement_sequence -from seqspec.Region import RegionCoordinateDifference - from seqspec.Region import ( + complement_sequence, + RegionCoordinateDifference, project_regions_to_coordinates, itx_read, ) from seqspec.Read import ReadCoordinate +from seqspec.Assay import Assay -def setup_index_args(parser): +def setup_index_args(parser) -> ArgumentParser: + """Create and configure the index command subparser.""" subparser = parser.add_parser( "index", description=""" @@ -30,12 +39,13 @@ def setup_index_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) @@ -50,6 +60,7 @@ def setup_index_args(parser): choices = [ "chromap", "kb", + "kb-single", "relative", "seqkit", "simpleaf", @@ -60,8 +71,9 @@ def setup_index_args(parser): ] subparser.add_argument( "-t", + "--tool", metavar="TOOL", - help=(f"Tool, [{', '.join(choices)}] (default: tab)"), + help=f"Tool, [{', '.join(choices)}] (default: tab)", default="tab", type=str, choices=choices, @@ -70,8 +82,9 @@ def setup_index_args(parser): choices = ["read", "region", "file"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -90,16 +103,17 @@ def setup_index_args(parser): subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser_required.add_argument( "-i", + "--ids", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, required=False, @@ -114,7 +128,8 @@ def setup_index_args(parser): return subparser -def validate_index_args(parser, args): +def validate_index_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the index command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -122,8 +137,8 @@ def validate_index_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r + if not args.ids: + args.ids = args.r if args.region: warnings.warn( "The '--region' argument is deprecated and will be removed in a future version. " @@ -131,61 +146,67 @@ def validate_index_args(parser, args): DeprecationWarning, ) - fn = args.yaml - m = args.m - ids = args.i # this can be a list of ids (reads, regions, or files) - t = args.t - o = args.o - subregion_type = args.subregion_type - rev = args.rev - idtype = args.s + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - if ids is None and (idtype == "read" or idtype == "region"): + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + if args.ids is None and (args.selector == "read" or args.selector == "region"): parser.error("Must specify ids with -i for -s read or -s region") - return run_index(fn, m, ids, idtype, t, rev, subregion_type, o=o) - - -def run_index( - spec_fn, - modality, - ids, - idtype, - fmt, - rev, - subregion_type, - o, -): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - x = index(spec, modality, ids, idtype, fmt, rev, subregion_type) +def seqspec_index( + spec: Assay, + modality: str, + ids: List[str], + idtype: str, + rev: bool = False, +) -> List[Dict[str, Any]]: + """Core functionality to get index information from the spec. + + Args: + spec: The Assay object to index + modality: The modality to index + ids: List of IDs to index + idtype: Type of ID (read, region, file) + rev: Whether to return 3'->5' region order + + Returns: + List of index dictionaries containing region coordinates and strand information + """ + GET_INDICES = { + "file": get_index_by_files, + } - # post processing - if o: - with open(o, "w") as f: - print(x, file=f) - else: - print(x) + GET_INDICES_BY_IDS = { + "file": get_index_by_file_ids, + "region": get_index_by_region_ids, + "read": get_index_by_read_ids, + } + + if not ids: + return GET_INDICES[idtype](spec, modality) + return GET_INDICES_BY_IDS[idtype](spec, modality, ids) - return +def format_index( + indices: List[Dict[str, Any]], fmt: str, subregion_type: Optional[str] = None +) -> str: + """Format index information into a specific output format. -def index( - spec, - modality, - ids, - idtype, - fmt, - rev=False, - subregion_type=None, -): + Args: + indices: List of index dictionaries from seqspec_index + fmt: Output format to use + subregion_type: Optional subregion type for filtering + + Returns: + Formatted index information as a string + """ FORMAT = { "chromap": format_chromap, "kb": format_kallisto_bus, + "kb-single": format_kallisto_bus_force_single, "relative": format_relative, "seqkit": format_seqkit_subseq, "simpleaf": format_simpleaf, @@ -195,22 +216,37 @@ def index( "zumis": format_zumis, } - GET_INDICES = { - "file": get_index_by_files, - } + if fmt not in FORMAT: + warnings.warn( + f"Unknown format '{fmt}'. Valid formats are: {', '.join(FORMAT.keys())}" + ) + return "" - GET_INDICES_BY_IDS = { - "file": get_index_by_file_ids, - "region": get_index_by_region_ids, - "read": get_index_by_read_ids, - } + return FORMAT[fmt](indices, subregion_type) - if len(ids) == 0: - indices = GET_INDICES[idtype](spec, modality) - else: - indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids) - return FORMAT[fmt](indices, subregion_type) +def run_index(parser: ArgumentParser, args: Namespace) -> None: + """Run the index command.""" + validate_index_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + indices = seqspec_index( + spec, + args.modality, + ids, + args.selector, + args.rev, + ) + + result = format_index(indices, args.tool, args.subregion_type) + + if args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) def get_index_by_files(spec, modality): @@ -283,6 +319,9 @@ def get_index_by_primer( return {read_id: new_rcs, "strand": rdc.read.strand} +FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"} + + def format_kallisto_bus(indices, subregion_type=None): bcs = [] umi = [] @@ -295,12 +334,7 @@ def format_kallisto_bus(indices, subregion_type=None): bcs.append(f"{idx},{cut.start},{cut.stop}") elif cut.region_type.upper() == "UMI": umi.append(f"{idx},{cut.start},{cut.stop}") - elif ( - cut.region_type.upper() == "CDNA" - or cut.region_type.upper() == "GDNA" - or cut.region_type.upper() == "PROTEIN" - or cut.region_type.upper() == "TAG" - ): + elif cut.region_type.upper() in FEATURE_REGION_TYPES: feature.append(f"{idx},{cut.start},{cut.stop}") if len(umi) == 0: umi.append("-1,-1,-1") @@ -311,6 +345,38 @@ def format_kallisto_bus(indices, subregion_type=None): return x +def format_kallisto_bus_force_single(indices, subregion_type=None): + bcs = [] + umi = [] + feature = [] + longest_feature = None + max_length = 0 + + for idx, region in enumerate(indices): + rg_strand = region.pop("strand") # noqa + for rgn, cuts in region.items(): + for cut in cuts: + if cut.region_type.upper() == "BARCODE": + bcs.append(f"{idx},{cut.start},{cut.stop}") + elif cut.region_type.upper() == "UMI": + umi.append(f"{idx},{cut.start},{cut.stop}") + elif cut.region_type.upper() in FEATURE_REGION_TYPES: + length = cut.stop - cut.start + if length > max_length: + max_length = length + longest_feature = f"{idx},{cut.start},{cut.stop}" + + if len(umi) == 0: + umi.append("-1,-1,-1") + if len(bcs) == 0: + bcs.append("-1,-1,-1") + if longest_feature: + feature.append(longest_feature) + + x = ",".join(bcs) + ":" + ",".join(umi) + ":" + ",".join(feature) + return x + + # this one should only return one string # TODO: return to this def format_seqkit_subseq(indices, subregion_type=None): diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index c01a9639..9a257311 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -1,13 +1,13 @@ from seqspec.utils import load_spec import json -from typing import List -from seqspec.Region import Region -from seqspec.Read import Read +from typing import Dict from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter +from argparse import RawTextHelpFormatter, ArgumentParser, Namespace +from pathlib import Path -def setup_info_args(parser): +def setup_info_args(parser) -> ArgumentParser: + """Create and configure the info command subparser.""" subparser = parser.add_parser( "info", description=""" @@ -24,12 +24,13 @@ def setup_info_args(parser): formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) choices = ["modalities", "meta", "sequence_spec", "library_spec"] subparser.add_argument( "-k", + "--key", metavar="KEY", - help=(f"Object to display, [{', '.join(choices)}] (default: meta)"), + help=f"Object to display, [{', '.join(choices)}] (default: meta)", type=str, default="meta", required=False, @@ -37,8 +38,9 @@ def setup_info_args(parser): choices = ["tab", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"The output format, [{', '.join(choices)}] (default: tab)"), + help=f"The output format, [{', '.join(choices)}] (default: tab)", type=str, default="tab", required=False, @@ -46,116 +48,239 @@ def setup_info_args(parser): ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, required=False, ) return subparser -def validate_info_args(parser, args): - spec_fn = args.yaml - o = args.o - k = args.k - fmt = args.f - return run_info(spec_fn, fmt, k, o) +def validate_info_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the info command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_info(parser: ArgumentParser, args: Namespace) -> None: + """Run the info command.""" + validate_info_args(parser, args) + + spec = load_spec(args.yaml) + + if args.key: + # Extract data + info = seqspec_info(spec, args.key) + # Format info + result = format_info(info, args.key, args.format) + if args.output: + with open(args.output, "w") as f: + if args.format == "json": + f.write(result) + else: + print(result, file=f) + else: + print(result) -def run_info(spec_fn, f, k=None, o=None): - # return json of the Assay object - spec = load_spec(spec_fn) - CMD = { + +def seqspec_info(spec: Assay, key: str) -> Dict: + """Get information from the spec based on the key. + + Args: + spec: The Assay object to get info from + key: The type of information to retrieve (modalities, meta, sequence_spec, library_spec) + + Returns: + Dictionary containing the requested information + + Raises: + KeyError: If the requested key is not supported + """ + INFO_FUNCS = { "modalities": seqspec_info_modalities, - "meta": seqspec_info, + "meta": seqspec_info_meta, "sequence_spec": seqspec_info_sequence_spec, "library_spec": seqspec_info_library_spec, } - s = "" - if k: - s = CMD[k](spec, f) + if key not in INFO_FUNCS: + raise KeyError( + f"Unsupported info key: {key}. Must be one of {list(INFO_FUNCS.keys())}" + ) + return INFO_FUNCS[key](spec) - if o: - with open(o, "w") as f: - json.dump(s, f, sort_keys=False, indent=4) - else: - print(s) - return +def format_info(info: Dict, key: str, fmt: str = "tab") -> str: + """Format information based on the key and format. -def seqspec_info(spec, fmt): - s = format_info(spec, fmt) - return s + Args: + info: Dictionary containing the information to format + key: The type of information to format (modalities, meta, sequence_spec, library_spec) + fmt: Output format (tab or json) + Returns: + Formatted string -def seqspec_info_library_spec(spec, fmt): + Raises: + KeyError: If the requested key is not supported + """ + FORMAT_FUNCS = { + "modalities": format_modalities, + "meta": format_meta, + "sequence_spec": format_sequence_spec, + "library_spec": format_library_spec, + } + if key not in FORMAT_FUNCS: + raise KeyError( + f"Unsupported format key: {key}. Must be one of {list(FORMAT_FUNCS.keys())}" + ) + return FORMAT_FUNCS[key](info, fmt) + + +def seqspec_info_meta(spec: Assay) -> Dict: + """Get meta information about the spec. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing meta information + """ + sd = spec.to_dict() + del sd["library_spec"] + del sd["sequence_spec"] + del sd["modalities"] + return {"meta": sd} + + +def seqspec_info_library_spec(spec: Assay) -> Dict: + """Get library specification information. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing library specifications by modality + """ modalities = spec.list_modalities() - s = "" + result = {} for m in modalities: libspec = spec.get_libspec(m) - s += format_library_spec(m, libspec.get_leaves(), fmt) - return s + result[m] = libspec.get_leaves() + return {"library_spec": result} -def seqspec_info_sequence_spec(spec: Assay, fmt): - reads = format_sequence_spec(spec.sequence_spec, fmt) - return reads +def seqspec_info_sequence_spec(spec: Assay) -> Dict: + """Get sequence specification information. + Args: + spec: The Assay object to get info from -def seqspec_info_modalities(spec, fmt): - modalities = format_modalities(spec.list_modalities(), fmt) - return modalities + Returns: + Dictionary containing sequence specifications + """ + return {"sequence_spec": spec.sequence_spec} -def format_info(spec: Assay, fmt="tab"): - sd = spec.to_dict() - del sd["library_spec"] - del sd["sequence_spec"] - del sd["modalities"] - s = "" +def seqspec_info_modalities(spec: Assay) -> Dict: + """Get list of modalities. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing list of modalities + """ + return {"modalities": spec.list_modalities()} + + +def format_meta(info: Dict, fmt: str = "tab") -> str: + """Format meta information. + + Args: + info: Dictionary containing meta information from seqspec_info_meta + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - for k, v in sd.items(): - s += f"{v}\t" - s = s[:-1] + return "\t".join(str(v) for v in info["meta"].values()) elif fmt == "json": - s = json.dumps(sd, sort_keys=False, indent=4) + return json.dumps(info["meta"], sort_keys=False, indent=4) + return "" + - return s +def format_modalities(info: Dict, fmt: str = "tab") -> str: + """Format list of modalities. + Args: + info: Dictionary containing modalities from seqspec_info_modalities + fmt: Output format (tab or json) -def format_modalities(modalities: List[str], fmt="tab"): - s = "" + Returns: + Formatted string + """ if fmt == "tab": - s = "\t".join(modalities) + return "\t".join(info["modalities"]) elif fmt == "json": - s = json.dumps(modalities, sort_keys=False, indent=4) - return s + return json.dumps(info["modalities"], sort_keys=False, indent=4) + return "" -def format_sequence_spec(sequence_spec: List[Read], fmt="tab"): - s = "" +def format_sequence_spec(info: Dict, fmt: str = "tab") -> str: + """Format sequence specification. + + Args: + info: Dictionary containing sequence specs from seqspec_info_sequence_spec + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - # format the output as a table - for r in sequence_spec: - s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{','.join([i.file_id for i in r.files])}\n" - s = s[:-1] + lines = [] + for r in info["sequence_spec"]: + files = ",".join([i.file_id for i in r.files]) if r.files else "" + lines.append( + f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4) - return s + return json.dumps( + [i.to_dict() for i in info["sequence_spec"]], sort_keys=False, indent=4 + ) + return "" + + +def format_library_spec(info: Dict, fmt: str = "tab") -> str: + """Format library specification. + Args: + info: Dictionary containing library specs from seqspec_info_library_spec + fmt: Output format (tab or json) -def format_library_spec(modality: str, library_spec: List[Region], fmt="tab"): - s = "" + Returns: + Formatted string + """ if fmt == "tab": - for r in library_spec: - file = None - if r.onlist: - file = r.onlist.filename - s += f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}\n" - s = s[:-1] + lines = [] + for modality, regions in info["library_spec"].items(): + for r in regions: + file = r.onlist.filename if r.onlist else None + lines.append( + f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps( - {modality: [i.to_dict() for i in library_spec]}, sort_keys=False, indent=4 + return json.dumps( + {m: [i.to_dict() for i in r] for m, r in info["library_spec"].items()}, + sort_keys=False, + indent=4, ) - return s + return "" diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py index 066c5b65..a65af025 100644 --- a/seqspec/seqspec_init.py +++ b/seqspec/seqspec_init.py @@ -1,18 +1,25 @@ +"""Init module for seqspec CLI. + +This module provides functionality to generate new seqspec files from a newick tree format. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List + +import newick from seqspec.Assay import Assay from seqspec.Region import Region from seqspec.File import File from seqspec.Read import Read -from typing import List -import newick -from argparse import RawTextHelpFormatter # example +# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" # seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" " ((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" +# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" -# seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" "((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" -# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -def setup_init_args(parser): +def setup_init_args(parser) -> ArgumentParser: + """Create and configure the init command subparser.""" subparser = parser.add_parser( "init", description=""" @@ -27,85 +34,90 @@ def setup_init_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") subparser_required.add_argument( - "-n", metavar="NAME", type=str, help="assay name", required=True + "-n", "--name", metavar="NAME", type=str, help="Assay name", required=True ) - # -m "rna,atac" subparser_required.add_argument( "-m", + "--modalities", metavar="MODALITIES", type=str, - help="list of comma-separated modalities (e.g. rna,atac)", + help="List of comma-separated modalities (e.g. rna,atac)", required=True, ) # -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" subparser_required.add_argument( "-r", + "--reads", metavar="READS", type=str, - help="list of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", + help="List of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", required=True, ) - subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "newick", - help=( - "tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)" - ), + help="Tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)", ) return subparser -def validate_init_args(parser, args): - name = args.n - modalities_str = args.m - newick_str = args.newick - o = args.o - reads_str = args.r +def validate_init_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the init command arguments.""" + if not args.newick: + parser.error("Newick tree must be provided") - if newick_str is None: - parser.error("modality-FASTQs pairs must be provided") + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - return run_init(name, modalities_str, newick_str, reads_str, o) +def run_init(parser: ArgumentParser, args: Namespace) -> None: + """Run the init command.""" + validate_init_args(parser, args) -def run_init(name: str, modalities_str, newick_str, reads_str, o=None): - modalities = modalities_str.split(",") - reads = parse_reads_string(reads_str) - tree = newick.loads(newick_str) - if len(tree[0].descendants) != len(modalities): - raise ValueError( - "Number of modalities must match number of modality-FASTQs pairs" - ) + modalities = args.modalities.split(",") + reads = parse_reads_string(args.reads) + regions = newick_to_regions(args.newick) - reads = parse_reads_string(reads_str) - spec = init(name, modalities, tree[0].descendants, reads) + spec = seqspec_init(args.name, modalities, regions, reads) + yaml_str = spec.to_YAML() + if yaml_str is None: + raise ValueError("Failed to generate YAML string from assay") - if o: - spec.to_YAML(o) + if args.output: + args.output.write_text(yaml_str) else: - print(spec.to_YAML()) + print(yaml_str) + - return +def seqspec_init( + name: str, modalities: List[str], regions: List[Region], reads: List[Read] +) -> Assay: + """Initialize a new seqspec specification. + Args: + name: Name of the assay + modalities: List of modalities + regions: List of Region objects + reads: List of read specifications -def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): - # make read for each fastq - # make region for each modality - # add modality regions to assay - rgns = [] - for t in tree: - r = Region(region_id="", region_type="", name="", sequence_type="") - rgns.append(newick_to_region(t, r)) + Returns: + Initialized Assay object - assay = Assay( + Raises: + ValueError: If number of modalities doesn't match number of regions + """ + if len(regions) != len(modalities): + raise ValueError("Number of modalities must match number of regions") + + return Assay( assay_id="", name=name, doi="", @@ -118,59 +130,103 @@ def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): sequence_kit="", sequence_protocol="", sequence_spec=reads, - library_spec=rgns, + library_spec=regions, ) - return assay -def newick_to_region( - node, region=Region(region_id="", region_type="", name="", sequence_type="") -): +def newick_to_regions(newick_str: str) -> List[Region]: + """Convert a newick string to a list of Region objects. + + Args: + newick_str: Newick format string representing the library structure + + Returns: + List of Region objects + + Raises: + ValueError: If newick string is invalid + """ + try: + tree = newick.loads(newick_str) + except Exception as e: + raise ValueError(f"Invalid newick string: {e}") + + regions = [] + for node in tree[0].descendants: + region = Region(region_id="", region_type="", name="", sequence_type="") + regions.append(newick_to_region(node, region)) + return regions + + +def newick_to_region(node: newick.Node, region: Region) -> Region: + """Convert a newick node to a Region object. + + Args: + node: Newick tree node + region: Base region object to populate + + Returns: + Populated Region object + """ region.region_id = node.name region.name = node.name - if len(node.descendants) == 0: + if not node.descendants: region.min_len = int(node.length) region.max_len = int(node.length) return region + region.regions = [] - for n in node.descendants: + for descendant in node.descendants: region.regions.append( newick_to_region( - n, - Region(region_id=n.name, region_type="", name=n.name, sequence_type=""), + descendant, + Region( + region_id=descendant.name, + region_type="", + name=descendant.name, + sequence_type="", + ), ) ) return region -def parse_reads_string(input_string): +def parse_reads_string(input_string: str) -> List[Read]: + """Parse a string of read specifications into Read objects. + + Args: + input_string: String containing read specifications in format + "modality,read_id,primer_id,min_len,strand:..." + + Returns: + List of Read objects + """ reads = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - modality, read_id, primer_id, min_len, strand = parts - - read = Read( - read_id=read_id, - name=read_id, - modality=modality, - primer_id=primer_id, - min_len=int(min_len), - max_len=int(min_len), - strand=strand, - files=[ - File( - file_id=read_id, - filename=read_id, - filetype="", - filesize=0, - url="", - urltype="", - md5="", - ) - ], + for obj in input_string.split(":"): + modality, read_id, primer_id, min_len, strand = obj.split(",") + + reads.append( + Read( + read_id=read_id, + name=read_id, + modality=modality, + primer_id=primer_id, + min_len=int(min_len), + max_len=int(min_len), + strand=strand, + files=[ + File( + file_id=read_id, + filename=read_id, + filetype="", + filesize=0, + url="", + urltype="", + md5="", + ) + ], + ) ) - reads.append(read) return reads diff --git a/seqspec/seqspec_methods.py b/seqspec/seqspec_methods.py index 2d19915c..6f7a3b01 100644 --- a/seqspec/seqspec_methods.py +++ b/seqspec/seqspec_methods.py @@ -1,17 +1,27 @@ +"""Methods module for seqspec. + +This module provides functionality to convert seqspec files into methods sections. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec from seqspec.Assay import Assay from seqspec.Region import Region -from argparse import RawTextHelpFormatter +from seqspec.Read import Read, File -def setup_methods_args(parser): +def setup_methods_args(parser) -> ArgumentParser: + """Create and configure the methods command subparser.""" subparser = parser.add_parser( "methods", description=""" Convert seqspec file into methods section. Examples: -seqspec methods -m rna spec.yaml # Return methods section for rna modality +seqspec methods -m rna -o methods.txt spec.yaml # Save methods section to file +seqspec methods -m rna spec.yaml # Print methods section to stdout --- """, help="Convert seqspec file into methods section", @@ -19,46 +29,50 @@ def setup_methods_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_methods_args(parser, args): - # if everything is valid the run_methods - fn = args.yaml - o = args.o +def validate_methods_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the methods command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - m = args.m - return run_methods(fn, m, o) + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_methods(spec_fn, m, o): - spec = load_spec(spec_fn) - m = methods(spec, m) +def run_methods(parser: ArgumentParser, args: Namespace) -> None: + """Run the methods command.""" + validate_methods_args(parser, args) - if o: - with open(o, "w") as f: - print(m, file=f) + spec = load_spec(args.yaml) + methods_text = methods(spec, args.modality) + + if args.output: + args.output.write_text(methods_text) else: - print(m) + print(methods_text) -def methods(spec, modality): +def methods(spec: Assay, modality: str) -> str: + """Generate methods section for spec and modality.""" m = f"""Methods The {modality} portion of the {spec.name} assay was generated on {spec.date}. """ @@ -67,15 +81,16 @@ def methods(spec, modality): # TODO: manage sequence/library protocol/kit for cases where each modality has different protocols/kits -def format_library_spec(spec: Assay, m): - leaves = spec.get_libspec(m).get_leaves() +def format_library_spec(spec: Assay, modality: str) -> str: + """Format library specification for methods section.""" + leaves = spec.get_libspec(modality).get_leaves() lib_prot = None if isinstance(spec.library_protocol, str): lib_prot = spec.library_protocol elif isinstance(spec.library_protocol, list): for i in spec.library_protocol: - if i.modality == m: + if i.modality == modality: lib_prot = i.protocol_id lib_kit = None @@ -83,7 +98,7 @@ def format_library_spec(spec: Assay, m): lib_kit = spec.library_kit elif isinstance(spec.library_kit, list): for i in spec.library_kit: - if i.modality == m: + if i.modality == modality: lib_kit = i.kit_id seq_prot = None @@ -91,7 +106,7 @@ def format_library_spec(spec: Assay, m): seq_prot = spec.sequence_protocol elif isinstance(spec.sequence_protocol, list): for i in spec.sequence_protocol: - if i.modality == m: + if i.modality == modality: seq_prot = i.protocol_id seq_kit = None @@ -99,7 +114,7 @@ def format_library_spec(spec: Assay, m): seq_kit = spec.sequence_kit elif isinstance(spec.sequence_kit, list): for i in spec.sequence_kit: - if i.modality == m: + if i.modality == modality: seq_kit = i.kit_id s = f""" @@ -112,13 +127,14 @@ def format_library_spec(spec: Assay, m): \nSequence structure\n The library was sequenced on a {seq_prot} using the {seq_kit} sequencing kit. The library was sequenced using the following configuration:\n """ - reads = spec.get_seqspec(m) + reads = spec.get_seqspec(modality) for idx, r in enumerate(reads, 1): s += format_read(r, idx) return s -def format_region(region: Region, idx: int = 1): +def format_region(region: Region, idx: int = 1) -> str: + """Format region for methods section.""" s = f"{idx}. {region.name}: {region.min_len}-{region.max_len}bp {region.sequence_type} sequence ({region.sequence})" if region.onlist: s += f", onlist file: {region.onlist.filename}.\n" @@ -127,14 +143,15 @@ def format_region(region: Region, idx: int = 1): return s -def format_read(read, idx: int = 1): +def format_read(read: Read, idx: int = 1) -> str: + """Format read for methods section.""" s = f"- {read.name}: {read.max_len} cycles on the {'positive' if read.strand == 'pos' else 'negative'} strand using the {read.primer_id} primer. The following files contain the sequences in Read {idx}:\n" - for idx, f in enumerate(read.files, 1): - s += " " + format_read_file(f, idx) - s = s[:-1] + if read.files: + for idx, f in enumerate(read.files, 1): + s += " " + format_read_file(f, idx) return s -def format_read_file(file, idx: int = 1): - s = f"- File {idx}: {file.filename}\n" - return s +def format_read_file(file: File, idx: int = 1) -> str: + """Format read file for methods section.""" + return f"- File {idx}: {file.filename}\n" diff --git a/seqspec/seqspec_modify.py b/seqspec/seqspec_modify.py index 006cf005..78aa1ecd 100644 --- a/seqspec/seqspec_modify.py +++ b/seqspec/seqspec_modify.py @@ -1,12 +1,20 @@ +"""Modify module for seqspec. + +This module provides functionality to modify attributes of various elements in seqspec files. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +from typing import List, Optional +import warnings + from seqspec.utils import load_spec from seqspec.File import File -from argparse import RawTextHelpFormatter, SUPPRESS -import warnings +from seqspec.Assay import Assay -# TODO fix modify to use the -s selector -def setup_modify_args(parser): - # given a spec, a region id and a list of key value property pairs, modify the spec +def setup_modify_args(parser) -> ArgumentParser: + """Create and configure the modify command subparser.""" subparser = parser.add_parser( "modify", description=""" @@ -22,34 +30,34 @@ def setup_modify_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) # Read properties subparser.add_argument( "--read-id", metavar="READID", - help=("New ID of read"), + help="New ID of read", type=str, default=None, ) subparser.add_argument( "--read-name", metavar="READNAME", - help=("New name of read"), + help="New name of read", type=str, default=None, ) subparser.add_argument( "--primer-id", metavar="PRIMERID", - help=("New ID of primer"), + help="New ID of primer", type=str, default=None, ) subparser.add_argument( "--strand", metavar="STRAND", - help=("New strand"), + help="New strand", type=str, default=None, ) @@ -59,45 +67,44 @@ def setup_modify_args(parser): subparser.add_argument( "--files", metavar="FILES", - help=("New files, (filename,filetype,filesize,url,urltype,md5:...)"), + help="New files, (filename,filetype,filesize,url,urltype,md5:...)", type=str, default=None, ) # Region properties - subparser.add_argument( "--region-id", metavar="REGIONID", - help=("New ID of region"), + help="New ID of region", type=str, default=None, ) subparser.add_argument( "--region-type", metavar="REGIONTYPE", - help=("New type of region"), + help="New type of region", type=str, default=None, ) subparser.add_argument( "--region-name", metavar="REGIONNAME", - help=("New name of region"), + help="New name of region", type=str, default=None, ) subparser.add_argument( "--sequence-type", metavar="SEQUENCETYPE", - help=("New type of sequence"), + help="New type of sequence", type=str, default=None, ) subparser.add_argument( "--sequence", metavar="SEQUENCE", - help=("New sequence"), + help="New sequence", type=str, default=None, ) @@ -106,25 +113,25 @@ def setup_modify_args(parser): subparser.add_argument( "--min-len", metavar="MINLEN", - help=("Min region length"), + help="Min region length", type=int, default=None, ) subparser.add_argument( "--max-len", metavar="MAXLEN", - help=("Max region length"), + help="Max region length", type=int, default=None, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, - required=False, ) subparser_required.add_argument( "-r", @@ -132,38 +139,38 @@ def setup_modify_args(parser): help=SUPPRESS, type=str, default=None, - required=False, ) subparser_required.add_argument( "-i", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, - required=False, ) choices = ["read", "region"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality of the assay"), + help="Modality of the assay", type=str, - default=None, required=True, ) return subparser -def validate_modify_args(parser, args): +def validate_modify_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the modify command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -174,103 +181,99 @@ def validate_modify_args(parser, args): if not args.i: args.i = args.r - # if everything is valid the run_format - fn = args.yaml - o = args.o - modality = args.m - # target_r = args.r - idtype = args.s # selector - ids = args.i + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - # Read properties - read_id = args.read_id - read_name = args.read_name - primer_id = args.primer_id - strand = args.strand - files = args.files + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - # Region properties - region_id = args.region_id - region_type = args.region_type - region_name = args.region_name - sequence_type = args.sequence_type - sequence = args.sequence - # Read and Region properties - min_len = args.min_len - max_len = args.max_len +def run_modify(parser: ArgumentParser, args: Namespace) -> None: + """Run the modify command.""" + validate_modify_args(parser, args) - spec = load_spec(fn) + spec = load_spec(args.yaml) + # Read properties read_kwd = { - "read_id": read_id, - "read_name": read_name, - "primer_id": primer_id, - "min_len": min_len, - "max_len": max_len, - "strand": strand, - "files": files, + "read_id": args.read_id, + "read_name": args.read_name, + "primer_id": args.primer_id, + "min_len": args.min_len, + "max_len": args.max_len, + "strand": args.strand, + "files": args.files, } + # Region properties region_kwd = { - "region_id": region_id, - "region_type": region_type, - "name": region_name, - "sequence_type": sequence_type, - "sequence": sequence, - "min_len": min_len, - "max_len": max_len, + "region_id": args.region_id, + "region_type": args.region_type, + "name": args.region_name, + "sequence_type": args.sequence_type, + "sequence": args.sequence, + "min_len": args.min_len, + "max_len": args.max_len, } - if idtype == "region": - spec = run_modify_region(spec, modality, ids, **region_kwd) - elif idtype == "read": - spec = run_modify_read(spec, modality, ids, **read_kwd) - # update region in spec - # once the region is updated, update the spec + if args.selector == "region": + spec = run_modify_region(spec, args.modality, args.i, **region_kwd) + elif args.selector == "read": + spec = run_modify_read(spec, args.modality, args.i, **read_kwd) + + # Update spec spec.update_spec() - if o: - spec.to_YAML(o) + + if args.output: + args.output.write_text(spec.to_YAML()) else: print(spec.to_YAML()) def run_modify_read( - spec, - modality, - target_read, - read_id, - read_name, - primer_id, - min_len, - max_len, - strand, - files, -): + spec: Assay, + modality: str, + target_read: str, + read_id: Optional[str] = None, + read_name: Optional[str] = None, + primer_id: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, + strand: Optional[str] = None, + files: Optional[str] = None, +) -> Assay: + """Modify read properties in spec.""" reads = spec.get_seqspec(modality) if files: - files = parse_files_string(files) + files_list = parse_files_string(files) for r in reads: if r.read_id == target_read: r.update_read_by_id( - read_id, read_name, modality, primer_id, min_len, max_len, strand, files + read_id, + read_name, + modality, + primer_id, + min_len, + max_len, + strand, + files_list, ) - return spec def run_modify_region( - spec, - modality, - target_region, - region_id, - region_type, - name, - sequence_type, - sequence, - min_len, - max_len, -): + spec: Assay, + modality: str, + target_region: str, + region_id: Optional[str] = None, + region_type: Optional[str] = None, + name: Optional[str] = None, + sequence_type: Optional[str] = None, + sequence: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, +) -> Assay: + """Modify region properties in spec.""" spec.get_libspec(modality).update_region_by_id( target_region, region_id, @@ -281,27 +284,23 @@ def run_modify_region( min_len, max_len, ) - return spec -# filename,filetype,filesize,url,urltype,md5:... -def parse_files_string(input_string): +def parse_files_string(input_string: str) -> List[File]: + """Parse files string into list of File objects. # filename,filetype,filesize,url,urltype,md5:...""" files = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - filename, filetype, filesize, url, urltype, md5 = parts - - file = File( - file_id=filename, - filename=filename, - filetype=filetype, - filesize=int(filesize), - url=url, - urltype=urltype, - md5=md5, + for f in input_string.split(":"): + filename, filetype, filesize, url, urltype, md5 = f.split(",") + files.append( + File( + file_id=filename, + filename=filename, + filetype=filetype, + filesize=int(filesize), + url=url, + urltype=urltype, + md5=md5, + ) ) - files.append(file) - return files diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index a29c63ad..51198b96 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -1,16 +1,28 @@ -from seqspec.Assay import Assay -from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist -from seqspec.utils import load_spec, map_read_id_to_regions -from seqspec.seqspec_find import find_by_region_type, find_by_region_id +"""Onlist module for seqspec CLI. + +This module provides functionality to generate and manage onlist files for seqspec regions. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings import os -from seqspec.utils import read_local_list, read_remote_list import itertools from typing import List -from argparse import SUPPRESS, RawTextHelpFormatter -import warnings +from seqspec.Assay import Assay +from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist +from seqspec.utils import ( + load_spec, + map_read_id_to_regions, + read_local_list, + read_remote_list, +) +from seqspec.seqspec_find import find_by_region_type, find_by_region_id -def setup_onlist_args(parser): + +def setup_onlist_args(parser) -> ArgumentParser: + """Create and configure the onlist command subparser.""" subparser = parser.add_parser( "onlist", description=""" @@ -25,20 +37,22 @@ def setup_onlist_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) choices = ["read", "region", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -55,6 +69,7 @@ def setup_onlist_args(parser): format_choices = ["product", "multi"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", type=str, default="product", @@ -63,16 +78,18 @@ def setup_onlist_args(parser): ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, default=None, required=True, @@ -81,7 +98,14 @@ def setup_onlist_args(parser): return subparser -def validate_onlist_args(parser, args): +def validate_onlist_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the onlist command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -89,32 +113,26 @@ def validate_onlist_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - ids = args.i - fmt = args.f - o = args.o - idtype = args.s + if not args.id: + args.id = args.r - return run_onlist(fn, m, ids, idtype, fmt, o) +def run_onlist(parser: ArgumentParser, args: Namespace) -> None: + """Run the onlist command.""" + validate_onlist_args(parser, args) -def run_onlist(spec_fn, modality, ids, idtype, fmt, o): # the base path is the path to the spec file - base_path = os.path.dirname(os.path.abspath(spec_fn)) + base_path = args.yaml.parent.absolute() # set the save path if it exists - if o: - save_path = os.path.abspath(o) + if args.output: + save_path = args.output else: # otherwise the save path is the same path as the spec - save_path = os.path.join(base_path, "onlist_joined.txt") + save_path = base_path / "onlist_joined.txt" # load spec - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) # if number of barcodes > 1 then we need to join them CMD = { @@ -123,20 +141,22 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): "read": run_onlist_read, } - onlists = CMD[idtype](spec, modality, ids) + onlists = CMD[args.selector](spec, args.modality, args.id) if len(onlists) == 0: - raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") + raise ValueError( + f"No onlist found for {args.modality}, {args.selector}, {args.id}" + ) # for only one onlist we can just return the path # if only one, its remote and we save it to the base path elif len(onlists) == 1: urltype = onlists[0].urltype - onlist_fn = os.path.basename(onlists[0].filename) - onlist_path = os.path.join(base_path, onlist_fn) - if os.path.exists(onlist_path): + onlist_fn = Path(onlists[0].filename).name + onlist_path = base_path / onlist_fn + if onlist_path.exists(): urltype = "local" - elif urltype == "http": + elif urltype in ["http", "https"]: # download the onlist to the base path and return the path onlist_elements = read_remote_list(onlists[0]) onlist_path = write_onlist(onlist_elements, save_path) @@ -147,15 +167,14 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): for o in onlists: if o.urltype == "local": lsts.append(read_local_list(o, base_path)) - elif o.urltype == "http": + elif o.urltype in ["http", "https"]: # base_path is ignored for remote onlists lsts.append(read_remote_list(o, base_path)) - onlist_elements = join_onlists(lsts, fmt) + onlist_elements = join_onlists(lsts, args.format) onlist_path = write_onlist(onlist_elements, save_path) # print the path to the onlist print(onlist_path) - return def run_onlist_region_type( @@ -174,7 +193,9 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist regions = find_by_region_id(spec, modality, region_id) onlists: List[Onlist] = [] for r in regions: - onlists.append(r.get_onlist()) + ol = r.get_onlist() + if ol: + onlists.append(ol) if len(onlists) == 0: raise ValueError(f"No onlist found for region {region_id}") return onlists diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index be7f304a..9ad66871 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -1,13 +1,33 @@ -from seqspec.utils import load_spec -from seqspec.seqspec_print_html import print_seqspec_html +"""Print module for seqspec CLI. + +This module provides functionality to print sequence and/or library structure +in various formats (ascii, png, html). +""" + +from typing import List, Any +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace import newick -from seqspec.utils import REGION_TYPE_COLORS +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +import matplotlib.patches as mpatches + +from seqspec.utils import load_spec, REGION_TYPE_COLORS +from seqspec.seqspec_print_html import print_seqspec_html from seqspec.Region import complement_sequence -from seqspec.Region import project_regions_to_coordinates -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay +from seqspec.seqspec_print_utils import libseq -def setup_print_args(parser): +def setup_print_args(parser) -> ArgumentParser: + """Create and configure the print command subparser. + + Args: + parser: The main argument parser to add the print subparser to. + + Returns: + The configured print subparser. + """ subparser = parser.add_parser( "print", description=""" @@ -23,20 +43,23 @@ def setup_print_args(parser): help="Display the sequence and/or library structure from seqspec file", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) format_choices = ["library-ascii", "seqspec-html", "seqspec-png", "seqspec-ascii"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"Format ({', '.join(format_choices)}), default: library-ascii"), + help=f"Format ({', '.join(format_choices)}), default: library-ascii", type=str, default="library-ascii", choices=format_choices, @@ -45,81 +68,107 @@ def setup_print_args(parser): return subparser -def validate_print_args(parser, args): - fmt = args.f +def validate_print_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the print command arguments. - fn = args.yaml - o = args.o - if fmt == "seqspec-png" and o is None: + Args: + parser: The argument parser. + args: The parsed arguments. + + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + if args.format == "seqspec-png" and args.output is None: parser.error("Output file required for png format") - return run_seqspec_print(fn, fmt, o) +def run_print(parser: ArgumentParser, args: Namespace) -> None: + """Run the print command. -def run_seqspec_print(spec_fn, fmt, o): - spec = load_spec(spec_fn) + Args: + parser: The argument parser. + args: The parsed arguments. + """ + + validate_print_args(parser, args) + + spec = load_spec(args.yaml) + result = seqspec_print(spec, args.format) + + if args.format == "seqspec-png": + result.savefig(args.output, dpi=300, bbox_inches="tight") + elif args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) - # TODO: add reads to seqspec html - # TODO: add reads to seqspec png - CMD = { + +def seqspec_print(spec: Assay, fmt: str): + """Print sequence specification in the specified format. + + Args: + spec: The seqspec specification to print + fmt: The format to print in (library-ascii, seqspec-html, seqspec-png, seqspec-ascii) + + Returns: + The formatted output (string or matplotlib figure) + + Raises: + ValueError: If format is not supported + """ + # Map format to print function + format_to_function = { "library-ascii": print_library_ascii, "seqspec-html": print_seqspec_html, "seqspec-png": print_seqspec_png, "seqspec-ascii": print_seqspec_ascii, } - s = CMD[fmt](spec) + if fmt not in format_to_function: + raise ValueError( + f"Unsupported format: {fmt}. Must be one of {list(format_to_function.keys())}" + ) - if fmt == "png": - return s.savefig(o, dpi=300, bbox_inches="tight") + return format_to_function[fmt](spec) - if o: - with open(o, "w") as f: - print(s, file=f) - else: - print(s) - return +def print_seqspec_ascii(spec: Assay) -> str: + """Print sequence specification in ASCII format. -def print_seqspec_ascii(spec): - p = [] + Args: + spec: The seqspec specification to print. + + Returns: + The ASCII formatted string. + """ + parts = [] for modality in spec.modalities: - p.append(libseq(spec, modality)) - return "\n".join(p) + parts.append(format_libseq(spec, modality, *libseq(spec, modality))) + return "\n".join(parts) -def libseq(spec, modality): +def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str: + """Format library sequence for a specific modality. + + Args: + spec: The seqspec specification. + modality: The modality to format. + p: Positive strand parts. + n: Negative strand parts. + + Returns: + The formatted string. + """ libspec = spec.get_libspec(modality) - seqspec = spec.get_seqspec(modality) - - p = [] - n = [] - leaves = libspec.get_leaves() - cuts = project_regions_to_coordinates(leaves) - for idx, read in enumerate(seqspec, 1): - read_len = read.max_len - read_id = read.read_id - primer_id = read.primer_id - primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0] - primer_pos = cuts[primer_idx] - if read.strand == "pos": - wsl = primer_pos.stop - 1 - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - p.append(f"{ws}|{arrow}>({idx}) {read_id}") - elif read.strand == "neg": - wsl = primer_pos.start - read_len - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - n.append(f"{ws}<{arrow}|({idx}) {read_id}") - - s = "\n".join( + + return "\n".join( [ modality, "---", @@ -129,120 +178,123 @@ def libseq(spec, modality): "\n".join(n), ] ) - return s -def run_print(data): - header = headerTemplate(data.name, data.doi, data.description, data.modalities) - header2 = "## Final Library" - library_spec = multiModalTemplate(data.library_spec) - s = f"{header}\n{header2}\n{library_spec}" - return s +def print_library_ascii(spec: Assay) -> str: + """Print library structure in ASCII format. + Args: + spec: The seqspec specification to print. -def run_print_sequence_spec(spec): - p = [] - for r in spec.sequence_spec: - p.append( - "\t".join( - [r.read_id, r.primer_id, r.strand, str(r.min_len), str(r.max_len)] - ) - ) - return "\n".join(p) - - -def print_library_ascii(spec): - t = [] + Returns: + The ASCII formatted string. + """ + trees = [] for r in spec.library_spec: - t.append(r.to_newick()) - n = ",".join(t) - # print(n) - tree = newick.loads(f"({n})") + trees.append(r.to_newick()) + tree_str = ",".join(trees) + tree = newick.loads(f"({tree_str})") return tree[0].ascii_art() -def argsort(arr): - # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3382369#3382369 - # by unutbu - return sorted(range(len(arr)), key=arr.__getitem__) - +def print_seqspec_png(spec: Assay): + """Print sequence specification as PNG. -def print_seqspec_png(spec): - # builds directly off of https://colab.research.google.com/drive/1ZCIGrwLEIfE0yo33bP8uscUNPEn1p1DH developed by https://github.com/LucasSilvaFerreira + Args: + spec: The seqspec specification to print. - # modality + Returns: + The matplotlib figure. + """ modalities = spec.list_modalities() modes = [spec.get_libspec(m) for m in modalities] lengths = [i.min_len for i in modes] nmodes = len(modalities) - # sort the modalities by their lengths + # Sort modalities by length asort = argsort(lengths) modalities = [modalities[i] for i in asort] lengths = [lengths[i] for i in asort] modes = [modes[i] for i in asort] - assay_id = spec.assay_id - fig, _ = plot_png(assay_id, modalities, modes, nmodes, lengths) - return fig + return plot_png(spec.assay_id, modalities, modes, nmodes, lengths) + + +def argsort(arr: List[Any]) -> List[int]: + """Get indices that would sort an array. + + Args: + arr: The array to sort. + + Returns: + List of indices that would sort the array. + """ + return sorted(range(len(arr)), key=arr.__getitem__) + +def plot_png( + assay: str, modalities: List[str], modes: List[Any], nmodes: int, lengths: List[int] +): + """Create PNG plot of sequence specification. -def plot_png(assay, modalities, modes, nmodes, lengths): - import matplotlib.pyplot as plt - from matplotlib.patches import Rectangle - import matplotlib.patches as mpatches + Args: + assay: The assay ID. + modalities: List of modalities. + modes: List of mode specifications. + nmodes: Number of modes. + lengths: List of lengths. + Returns: + The matplotlib figure. + """ fsize = 15 plt.rcParams.update({"font.size": fsize}) - fig, ax = plt.subplots( - figsize=(10, 1 * nmodes), nrows=nmodes, constrained_layout=True - ) - fig.suptitle(assay) + fig, _ = plt.subplots(figsize=(10, 1 * nmodes), nrows=nmodes) + title_offset = 0.98 if nmodes > 1 else 1.2 + fig.suptitle(assay, y=title_offset) + rts = [] for m, ax in zip(modes, fig.get_axes()): - # get leaves + # Get leaves leaves = m.get_leaves() - # setup plotting variables + # Setup plotting variables y = 0 x = 0 height = 1 for idx, node in enumerate(leaves): - # region tupe + # Region type rtype = node.region_type.lower() - # add to the global list so we can make a legend rts.append(rtype) - # get region properties + + # Get region properties length = node.min_len label = f"{length}" - # setup rectangle for the region + # Setup rectangle for the region rectangle = Rectangle( (x, y), length, height, color=REGION_TYPE_COLORS[rtype], ec="black" ) - # write in the length of the region in the rectangle + # Write length in the rectangle ax.text( x + length / 2, y + height / 2, label, horizontalalignment="center", verticalalignment="center", - ) # , rotation=90) - # add the rectangle + ) ax.add_patch(rectangle) - # add length to x for next region + # Add length to x for next region x += length ax.autoscale() + ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)}) - # since all axes use the same scale, set the xlim to be 0 to the max length - ax.set(**{"xlim": (0, max(lengths))}) - - # hide the spines + # Hide the spines for spine in ["right", "top", "left", "bottom"]: ax.spines[spine].set_visible(False) # Hide the axis and ticks and labels @@ -250,10 +302,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): ax.set_yticklabels([]) ax.set_yticks([]) - # label the modality on the ylabel + # Label the modality on the ylabel ax.set_ylabel(m.region_type, rotation=0, fontsize=20, ha="right", va="center") - # adjust the xaxis for the last modality to show the length + # Adjust the xaxis for the last modality to show the length ax.xaxis.set_visible(True) ax.spines["bottom"].set_visible(True) ax.minorticks_on() @@ -264,68 +316,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): } ) - # setup the figure legend + # Setup the figure legend handles = [] for t in sorted(set(rts)): handles.append(mpatches.Patch(color=REGION_TYPE_COLORS[t], label=t)) fig.legend(handles=handles, loc="center", bbox_to_anchor=(1.1, 0.55)) - return (fig, ax) - -def headerTemplate(name, doi, description, modalities): - s = f"""# {name} -- DOI: [{doi}]({doi}) -- Description: {description} -- Modalities: {", ".join(modalities)} - """ - return s - - -def atomicRegionTemplate( - name, region_type, sequence_type, sequence, min_len, max_len, onlist, ns=0 -): - s = f"""
{name} - -{' '*ns}- region_type: {region_type} -{' '*ns}- sequence_type: {sequence_type} -{' '*ns}- sequence:
{sequence}
-{' '*ns}- min_len: {min_len} -{' '*ns}- max_len: {max_len} -{' '*ns}- onlist: {onlist} -{' '*ns}
""" - return s - - -def regionsTemplate(regions): - s = "\n".join( - [ - f"{idx + 1}. " - + atomicRegionTemplate( - v.name, - v.region_type, - v.sequence_type, - v.sequence, - v.min_len, - v.max_len, - v.onlist, - len(str(idx + 1)) - + 1 - + 1, # length of string rep of number plus 1 for "." plus 1 for space - ) - for idx, v in enumerate(regions) - ] - ) - return s - - -def libStructTemplate(region): - s = f"""###### {region.name} -
{region.sequence}
""" - return s - - -def multiModalTemplate(library_spec): - s = "\n".join( - [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec] - ) - return s + return fig diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index f469238a..8ec4694e 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,16 +1,23 @@ -from seqspec.Region import Region +"""Print HTML module for seqspec. +This module provides functionality to generate HTML representations of seqspec files. +It is used by the print command with the 'seqspec-html' format option. +""" + +from typing import List, Optional + +from seqspec.Assay import Assay +from seqspec.Region import Region, Onlist, complement_sequence +from seqspec.Read import Read, File +from seqspec.seqspec_print_utils import libseq -def print_seqspec_html(spec): - # header = headerTemplate(spec.name, spec.doi, spec.description, spec.modalities) - # header2 = "## Final Library" - # library_spec = multiModalTemplate(spec.library_spec) - # s = f"{header}\n{header2}\n{library_spec}" - s = htmlTemplate(spec) - return s +def print_seqspec_html(spec: Assay) -> str: + """Generate HTML representation of seqspec.""" + return htmlTemplate(spec) -def headerTemplate(name, doi, description, modalities): + +def headerTemplate(name: str, doi: str, description: str, modalities: List[str]) -> str: s = f"""

{name}