From 2e19173a5743a4ef66c3770842d27027ed747219 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:42:42 -0500 Subject: [PATCH 01/21] update schema (#52) --- seqspec/schema/seqspec.schema.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 997d14a..0dc9a04 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -71,6 +71,8 @@ "massively parallel reporter assay (OBI:0002675)", "chromosome conformation capture-on-chip assay (OBI:0002458)", "single nucleus methylation chromatin conformation capture seq (NTR:0000745)", + "in vitro CRISPR screen using flow cytometry (OBI:0003661)", + "in vitro CRISPR screen using single-cell RNA-seq (OBI:0003660)", "Custom" ] }, @@ -141,6 +143,7 @@ "Illumina NextSeq 2000 (EFO:0010963)", "Illumina NovaSeq X (NTR:0000765)", "PacBio RS II (EFO:0008631)", + "Illumina NovaSeq X (EFO:0022840)", "Custom" ] }, From 289e7e3228a6c35289e82d1d760ca6c84610f16d Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:43:22 -0500 Subject: [PATCH 02/21] update file_exsits function to check file url in igvf portal (#53) --- seqspec/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/seqspec/utils.py b/seqspec/utils.py index bad593d..1ed0b25 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -119,6 +119,16 @@ def region_ids_in_spec(seqspec, modality, region_ids): def file_exists(uri): try: + if uri.startswith("https://api.data.igvf.org"): + auth = get_remote_auth_token() + if auth is None: + print("Warning: IGVF_API_KEY and IGVF_SECRET_KEY not set") + r = requests.head(uri, auth=auth) + if r.status_code == 307: + # igvf download link will redirect to a presigned amazon s3 url, HEAD request will not work. 
+ r = requests.get(r.headers["Location"], headers={"Range": "bytes=0-0"}) + return r.status_code == 206 + return r.status_code == 200 r = requests.head(uri) if r.status_code == 302: return file_exists(r.headers["Location"]) From 2a5df33dab6457c31fd9999759d99f262d1ff9cb Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Sun, 3 Nov 2024 19:08:43 -0800 Subject: [PATCH 03/21] adding seqspec spec tokenization --- seqspec/main.py | 6 +- seqspec/seqspec_convert.py | 403 +++++++++++++++++++++++++++++++++++++ seqspec/seqspec_genbank.py | 210 ------------------- 3 files changed, 406 insertions(+), 213 deletions(-) create mode 100644 seqspec/seqspec_convert.py delete mode 100644 seqspec/seqspec_genbank.py diff --git a/seqspec/main.py b/seqspec/main.py index d64a881..a5bb35e 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -6,7 +6,7 @@ from .seqspec_check import setup_check_args, validate_check_args from .seqspec_find import setup_find_args, validate_find_args -# from .seqspec_genbank import setup_genbank_args, validate_genbank_args +from .seqspec_convert import setup_convert_args, validate_convert_args from .seqspec_modify import setup_modify_args, validate_modify_args from .seqspec_index import setup_index_args, validate_index_args from .seqspec_info import setup_info_args, validate_info_args @@ -54,7 +54,7 @@ def main(): "find": setup_find_args(subparsers), "file": setup_file_args(subparsers), "format": setup_format_args(subparsers), - # "genbank": setup_genbank_args(subparsers), + "convert": setup_convert_args(subparsers), "index": setup_index_args(subparsers), "info": setup_info_args(subparsers), "init": setup_init_args(subparsers), @@ -98,7 +98,7 @@ def main(): "version": validate_version_args, "file": validate_file_args, "upgrade": validate_upgrade_args, - # "genbank": validate_genbank_args, + "convert": validate_convert_args, } COMMAND_TO_FUNCTION[sys.argv[1]](parser, args) diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py new file mode 100644 index 0000000..ccffa7c --- /dev/null +++ b/seqspec/seqspec_convert.py @@ -0,0 +1,403 @@ +from seqspec.utils import load_genbank +import json +from seqspec.Region import Region +from seqspec.Assay import Assay +from seqspec.utils import load_spec +import numpy as np +from typing import Dict, List, Tuple +from os import path +from pathlib import Path + + +schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + +with open(schema_fn, "r") as f: + schema = json.load(f) +REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"] +MODALITIES = schema["properties"]["modalities"]["items"]["enum"] +SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"] + + +def setup_convert_args(parser): + subparser = parser.add_parser( + "convert", + description="get genbank about seqspec file", + help="get genbank about seqspec file", + ) + choices = ["genbank", "seqspec", "token"] + subparser.add_argument( + "-ifmt", help="Input format", type=str, default="seqspec", choices=choices + ) + subparser.add_argument( + "-ofmt", help="Output format", type=str, default="token", choices=choices + ) + + subparser.add_argument( + "-o", + metavar="OUT", + help=("Path to output file"), + type=str, + default=None, + required=False, + ) + subparser.add_argument( + "input_file", metavar="IN", help="Path to input file", type=str + ) + return subparser + + +def validate_convert_args(parser, args): + # if everything is valid the run_convert + fn = args.input_file + ifmt = args.ifmt + ofmt = args.ofmt + 
o = args.o + + cnv = run_convert(fn, ifmt, ofmt, o) + print(cnv) + + # if o: + # spec.to_YAML(o) + # else: + # print(json.dumps(spec, sort_keys=False, indent=4)) + + +def load_input_file(fn, ifmt): + LOAD = { + "genbank": load_genbank, + "seqspec": load_spec, + # "token": load_token, + } + return LOAD[ifmt](fn) + + +def get_feature_names() -> List[str]: + """Generate ordered list of column names""" + features = [] + + # Modality one-hot features + features.extend([f"modality_{mod}" for mod in MODALITIES]) + + # Region type one-hot features + features.extend([f"region_{rt}" for rt in REGION_TYPES]) + + # Sequence type one-hot features + features.extend([f"seq_{st}" for st in SEQUENCE_TYPES]) + + # Numerical features + features.extend(["min_len", "max_len", "position"]) + + return features + + +def save_tokenized_spec( + matrix: np.ndarray, row_identifiers: List[Tuple[str, str, str]], output_path: str +): + """ + Save tokenized spec output to three files: + - spec.npy: The matrix data + - rows.txt: Tab-separated list of (spec_id, modality, region_type) + - cols.txt: List of column names + + Args: + matrix: The tokenized matrix from tokenize_specs + row_identifiers: List of (spec_id, modality, region_type) tuples + output_path: Path to save the output (directory) + """ + # Create output directory if needed + output_dir = Path(output_path) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save matrix + np.save(output_dir / "spec.npy", matrix) + + # Save row identifiers (tab-separated) + with open(output_dir / "rows.txt", "w") as f: + for spec_id, modality, region_type in row_identifiers: + f.write(f"{spec_id}\t{modality}\t{region_type}\n") + + # Save column names + feature_names = get_feature_names() + with open(output_dir / "cols.txt", "w") as f: + for feature in feature_names: + f.write(f"{feature}\n") + + +def run_convert(fn, ifmt, ofmt, o): + CONVERT = { + ("genbank", "seqspec"): gb_to_seqspec, + ("seqspec", "token"): seqspec_to_token, + } + file = load_input_file(fn, ifmt) + c = CONVERT[(ifmt, ofmt)](file) + if o: + save_tokenized_spec(*c, o) + else: + return c + return + + +def seqspec_to_token(spec): + # for each modalitiy, make a dictionary of regions + specs_regions = {} + modalities = spec.list_modalities() + for modality in modalities: + regions = [i.to_dict() for i in spec.get_libspec(modality).get_leaves()] + specs_regions[modality] = regions + + # Convert to tokenized matrix + return tokenize_specs({spec.assay_id: specs_regions}) + + +def tokenize_specs( + specs_regions: Dict[str, Dict[str, List[Dict]]] +) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]: + """ + Convert specs into a single matrix where each row represents a complete region specification + + Args: + specs_regions: Dict[spec_id -> Dict[modality -> List[region_dict]]] + + Returns: + - Matrix where each row is [modality_onehot, region_type_onehot, sequence_type_onehot, min_len, max_len, position] + - List of (spec_id, modality, region_type) identifying each row + """ + # Calculate feature dimensions + n_modality_features = len(MODALITIES) + n_region_type_features = len(REGION_TYPES) + n_sequence_type_features = len(SEQUENCE_TYPES) + + # Total features = one-hot encodings + numerical features + total_features = ( + n_modality_features # modality one-hot + + n_region_type_features # region_type one-hot + + n_sequence_type_features # sequence_type one-hot + + 2 # min_len, max_len + ) + + # Total features = one-hot encodings + numerical features + position + total_features = ( + n_modality_features # modality 
one-hot + + n_region_type_features # region_type one-hot + + n_sequence_type_features # sequence_type one-hot + + 2 # min_len, max_len + + 1 # position in region list (1-based) + ) + + rows = [] # Will hold our feature vectors + row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples + + for spec_id, modality_regions in specs_regions.items(): + for modality, regions in modality_regions.items(): + # Enumerate regions to get position (1-based) + for position, region in enumerate(regions, start=1): + # Create feature vector for this region + feature_vector = np.zeros(total_features) + current_idx = 0 + + # Add modality one-hot + modality_idx = MODALITIES.index(modality) + feature_vector[modality_idx] = 1 + current_idx += n_modality_features + + # Add region_type one-hot + region_type_idx = REGION_TYPES.index(region["region_type"]) + feature_vector[current_idx + region_type_idx] = 1 + current_idx += n_region_type_features + + # Add sequence_type one-hot + sequence_type_idx = SEQUENCE_TYPES.index(region["sequence_type"]) + feature_vector[current_idx + sequence_type_idx] = 1 + current_idx += n_sequence_type_features + + # Add lengths + feature_vector[current_idx] = region["min_len"] + feature_vector[current_idx + 1] = region["max_len"] + current_idx += 2 + + # Add position + feature_vector[current_idx] = position + + # Store feature vector and identifier + rows.append(feature_vector) + row_identifiers.append((spec_id, modality, region["region_type"])) + + return np.array(rows), row_identifiers + + +def gb_to_seqspec(gb): + ex = gb_to_list(gb) + nested_json = nest_intervals(ex) + filled_regions = fill_gaps(gb.sequence, nested_json) + regions = convert(filled_regions) + reads = [] + spec = Assay( + "genbank", + "illumina", + "genbank thing", + "doi", + "date", + ["source"], + "description", + "", + "", + "", + "", + reads, + regions, + ) + return spec + + +def gb_to_list(gb): + feat = [] + label = "source" + for f in gb.features: + id = f.key + + if "complement" in f.location: + start, stop = tuple(map(int, f.location[11:-1].split(".."))) + else: + start, stop = tuple(map(int, f.location.split(".."))) + + # convert to 0-index + start -= 1 + length = stop - start + seq = gb.sequence[start:stop] + + for q in f.qualifiers: + if q.key == "/label=": + label = q.value + break + feat.append( + { + "id": id, + "label": label, + "start": start, + "stop": stop, + "length": length, + "seq": seq, + } + ) + return feat + + +def nest_intervals(intervals): + def nest(start_index, end_limit): + nested = [] + + i = start_index + while i < len(intervals) and intervals[i]["start"] < end_limit: + current_interval = intervals[i] + child, next_index = nest(i + 1, current_interval["stop"]) + interval_obj = { + "id": current_interval["id"], + "label": current_interval["label"], + "start": current_interval["start"], + "stop": current_interval["stop"], + "length": current_interval["length"], + "seq": current_interval["seq"], + "regions": child, + } + nested.append(interval_obj) + i = next_index + + return nested, i + + result, _ = nest(0, intervals[0]["stop"]) + return result + + +def fill_gaps(seq, regions, parent_start=0, parent_stop=0): + if len(regions) == 0: + return [] + + # Insert a filler at the start if necessary + if regions[0]["start"] > parent_start: + start = parent_start + stop = regions[0]["start"] + s = seq[start:stop] + regions.insert( + 0, + { + "id": "filler_start", + "label": "filler_start", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + }, + 
) + + new_regions = [] + for i, region in enumerate(regions): + # Append the current region + new_regions.append(region) + + # Recursive call for nested regions + if "regions" in region: + region["regions"] = fill_gaps( + seq, region["regions"], region["start"], region["stop"] + ) + + # Check for gap and insert a filler + if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: + filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' + start = region["stop"] + stop = regions[i + 1]["start"] + s = seq[start:stop] + new_regions.append( + { + "id": filler_id, + "label": filler_id, + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + # Insert a filler at the end if necessary + if new_regions[-1]["stop"] < parent_stop: + start = new_regions[-1]["stop"] + stop = parent_stop + s = seq[start:stop] + new_regions.append( + { + "id": "filler_end", + "label": "filler_end", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + return new_regions + + +# convert filled regions to seqspec, must be recursive function +# regions is a list +def convert(regions): + if len(regions) == 0: + return [] + new_regions = [] + for r in regions: + rgn = Region( + r["id"], + "", + r["label"], + "fixed", + r["seq"], + r["length"], + r["length"], + None, + None, + ) + if len(r["regions"]) > 0: + rgn.regions = convert(r["regions"]) + new_regions.append(rgn) + return new_regions diff --git a/seqspec/seqspec_genbank.py b/seqspec/seqspec_genbank.py deleted file mode 100644 index 419d67c..0000000 --- a/seqspec/seqspec_genbank.py +++ /dev/null @@ -1,210 +0,0 @@ -from seqspec.utils import load_genbank -import json -from seqspec.Region import Region -from seqspec.Assay import Assay - - -def setup_genbank_args(parser): - subparser = parser.add_parser( - "genbank", - description="get genbank about seqspec file", - help="get genbank about seqspec file", - ) - - subparser.add_argument("gbk", help="Genbank file") - subparser.add_argument( - "-o", - metavar="OUT", - help=("Path to output file"), - type=str, - default=None, - required=False, - ) - return subparser - - -def validate_genbank_args(parser, args): - # if everything is valid the run_genbank - fn = args.gbk - o = args.o - gb = load_genbank(fn) - - spec = run_genbank(gb) - - if o: - spec.to_YAML(o) - else: - print(json.dumps(spec, sort_keys=False, indent=4)) - - -def run_genbank(gb): - ex = gb_to_list(gb) - nested_json = nest_intervals(ex) - filled_regions = fill_gaps(gb.sequence, nested_json) - regions = convert(filled_regions) - spec = Assay( - "genbank", - "illumina", - "genbank thing", - "doi", - "date", - "description", - ["source"], - "", - regions, - ) - return spec - - -def gb_to_list(gb): - feat = [] - label = "source" - for f in gb.features: - id = f.key - - if "complement" in f.location: - start, stop = tuple(map(int, f.location[11:-1].split(".."))) - else: - start, stop = tuple(map(int, f.location.split(".."))) - - # convert to 0-index - start -= 1 - length = stop - start - seq = gb.sequence[start:stop] - - for q in f.qualifiers: - if q.key == "/label=": - label = q.value - break - feat.append( - { - "id": id, - "label": label, - "start": start, - "stop": stop, - "length": length, - "seq": seq, - } - ) - return feat - - -def nest_intervals(intervals): - def nest(start_index, end_limit): - nested = [] - - i = start_index - while i < len(intervals) and intervals[i]["start"] < end_limit: - current_interval = intervals[i] - child, next_index = nest(i + 1, 
current_interval["stop"]) - interval_obj = { - "id": current_interval["id"], - "label": current_interval["label"], - "start": current_interval["start"], - "stop": current_interval["stop"], - "length": current_interval["length"], - "seq": current_interval["seq"], - "regions": child, - } - nested.append(interval_obj) - i = next_index - - return nested, i - - result, _ = nest(0, intervals[0]["stop"]) - return result - - -def fill_gaps(seq, regions, parent_start=0, parent_stop=0): - if len(regions) == 0: - return [] - - # Insert a filler at the start if necessary - if regions[0]["start"] > parent_start: - start = parent_start - stop = regions[0]["start"] - s = seq[start:stop] - regions.insert( - 0, - { - "id": "filler_start", - "label": "filler_start", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - }, - ) - - new_regions = [] - for i, region in enumerate(regions): - # Append the current region - new_regions.append(region) - - # Recursive call for nested regions - if "regions" in region: - region["regions"] = fill_gaps( - seq, region["regions"], region["start"], region["stop"] - ) - - # Check for gap and insert a filler - if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: - filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' - start = region["stop"] - stop = regions[i + 1]["start"] - s = seq[start:stop] - new_regions.append( - { - "id": filler_id, - "label": filler_id, - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - # Insert a filler at the end if necessary - if new_regions[-1]["stop"] < parent_stop: - start = new_regions[-1]["stop"] - stop = parent_stop - s = seq[start:stop] - new_regions.append( - { - "id": "filler_end", - "label": "filler_end", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - return new_regions - - -# convert filled regions to seqspec, must be recursive function -# regions is a list -def convert(regions): - if len(regions) == 0: - return [] - new_regions = [] - for r in regions: - rgn = Region( - r["id"], - "", - r["label"], - "fixed", - r["seq"], - r["length"], - r["length"], - None, - None, - ) - if len(r["regions"]) > 0: - rgn.regions = convert(r["regions"]) - new_regions.append(rgn) - return new_regions From e3a6dea4b3a25742e0335391278b79aa79f7bc7e Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:14:22 -0600 Subject: [PATCH 04/21] allow https for remote onlist (#54) --- seqspec/seqspec_onlist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index a29c63a..b021a5a 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -136,7 +136,7 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): onlist_path = os.path.join(base_path, onlist_fn) if os.path.exists(onlist_path): urltype = "local" - elif urltype == "http": + elif urltype in ["http", "https"]: # download the onlist to the base path and return the path onlist_elements = read_remote_list(onlists[0]) onlist_path = write_onlist(onlist_elements, save_path) @@ -147,7 +147,7 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): for o in onlists: if o.urltype == "local": lsts.append(read_local_list(o, base_path)) - elif o.urltype == "http": + elif o.urltype in ["http", "https"]: # base_path is ignored for remote onlists lsts.append(read_remote_list(o, base_path)) onlist_elements = 
join_onlists(lsts, fmt) From 8e9554f8293f9efd82bf008eac536e06b743cc12 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:49:55 -0600 Subject: [PATCH 05/21] Update seqspec check so we can run it directly in python script (#58) * update seqspec check * add spec parameter back to check function --- seqspec/seqspec_check.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 702dfbe..77e94b4 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -33,19 +33,14 @@ def setup_check_args(parser): def validate_check_args(parser, args): spec_fn = args.yaml o = args.o - schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") - return run_check(schema_fn, spec_fn, o) + return run_check(spec_fn, o) -def run_check(schema_fn, spec_fn, o): +def run_check(spec_fn, o): spec = load_spec(spec_fn) - with open(schema_fn, "r") as stream: - schema = yaml.load(stream, Loader=yaml.Loader) - v = Draft4Validator(schema) - - errors = check(v, spec, spec_fn) + errors = check(spec) if errors: if o: @@ -56,14 +51,19 @@ def run_check(schema_fn, spec_fn, o): return errors -def check(schema: Draft4Validator, spec: Assay, spec_fn: str): +def check(spec: Assay, spec_fn: str = None): + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + + with open(schema_fn, "r") as stream: + schema = yaml.load(stream, Loader=yaml.Loader) + validator = Draft4Validator(schema) errors = [] idx = 0 # with open("del.json", "w") as f: # json.dump(spec.to_dict(), f, indent=4) - for idx, error in enumerate(schema.iter_errors(spec.to_dict()), 1): + for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): errors.append( f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]" ) From a280567bcaf03b392a8718b749f9617efa470420 Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Wed, 15 Jan 2025 15:50:36 -0800 Subject: [PATCH 06/21] added python usage to docs --- docs/SEQSPEC_TOOL.md | 80 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/docs/SEQSPEC_TOOL.md b/docs/SEQSPEC_TOOL.md index 1e355e7..2703767 100644 --- a/docs/SEQSPEC_TOOL.md +++ b/docs/SEQSPEC_TOOL.md @@ -61,6 +61,12 @@ Check that the `seqspec` file is correctly formatted and consistent with the [sp seqspec check [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_check import run_check + +run_check(schema_fn: str, spec_fn: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - `yaml` corresponds to the `seqspec` file. @@ -133,6 +139,12 @@ $ seqspec check spec.yaml seqspec find [-h] [-o OUT] [-s Selector] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_find import run_find + +run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `-s Selector` is the type of the ID you are searching for (default: region). Can be one of - read @@ -195,6 +207,12 @@ $ seqspec find -m rna -s region-type -i barcode spec.yaml seqspec file [-h] [-o OUT] [-i IDs] -m MODALITY [-s SELECTOR] [-f FORMAT] [-k KEY] yaml ``` +```python +from seqspec.seqspec_file import run_file + +run_file(spec_fn: str, m: str, ids: List[str], idtype: str, fmt: str, k: str, o: str, fp=False) +``` + - optionally, `-o OUT` can be used to write the output to a file. 
- optionally, `-s Selector` is the type of the ID you are searching for (default: read). Can be one of - read @@ -266,6 +284,11 @@ Automatically fill in missing fields in the spec. seqspec format [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_format import run_format +run_format(spec_fn: str, o: str) +``` + - `-o OUT` the path to create the formatted `seqspec` file. - `yaml` corresponds to the `seqspec` file. @@ -283,11 +306,16 @@ $ seqspec format -o spec.yaml spec.yaml Identify the position of elements in a spec for use in downstream tools. Returns the 0-indexed position of elements contained in a given region in the 5'->3' direction. -``` +```bash seqspec index [-o OUT] [-t TOOL] [--rev] -m MODALITY -r REGION yaml seqspec index [-h] [-o OUT] [-t TOOL] [-s SELECTOR] [--rev] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_index import run_index +run_index(spec_fn: str, modality: str, ids: List[str], idtype: str, fmt: str, rev: str, subregion_type: str, o) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `--rev` can be set to return the 3'->5' index. - optionally, `-t TOOL` returns the indices in the format specified by the tool. One of: @@ -342,6 +370,11 @@ $ seqspec index -m atac -t kb -s file spec.yaml seqspec info [-h] [-k KEY] [-f FORMAT] [-o OUT] yaml ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k=None, o=None) +``` + - optionally, `-o OUT` path to write the info. - optionally, `-k KEY` the object to display (default: meta). Can be one of - modalities @@ -413,6 +446,11 @@ $ seqspec info -f json -k sequence_spec spec.yaml seqspec init [-h] -n NAME -m MODALITIES -r READS [-o OUT] newick ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k: str = None, o: str = None) +``` + - optionally, `-o OUT` path to create `seqspec` file. - `-m MODALITIES` is a comma-separated list of modalities (e.g. rna,atac) - `-n NAME` is the name associated with the `seqspec` file. @@ -432,7 +470,7 @@ $ seqspec init -n myassay -m rna -o spec.yaml -r rna,R1.fastq.gz,r1_primer,26,po $ seqspec init -n myassay -m rna,atac -o spec.yaml -r rna,rna_R1.fastq.gz,rna_r1_primer,26,pos:rna,rna_R2.fastq.gz,rna_r2_primer,100,neg:atac,atac_R1.fastq.gz,atac_r1_primer,100,pos:atac,atac_R2.fastq.gz,atac_r1_primer,16,neg:atac,atac_R3.fastq.gz,atac_r2_primer,100,neg "(((rna_r1_primer:0,barcode:16,umi:12,cdna:150,rna_r2_primer:0)rna),(barcode:16,atac_r1_primer:1,gdna:150,atac_r2_primer)atac)" ``` -## `seqsoec methods`: Convert seqspec file into methods section +## `seqspec methods`: Convert seqspec file into methods section Generate a methods section from a seqspec file. @@ -440,6 +478,11 @@ Generate a methods section from a seqspec file. seqspec methods [-h] -m MODALITY [-o OUT] yaml ``` +```python +from seqspec.seqspec_methods import run_methods +run_methods(spec_fn: str, m: str, o: str) +``` + - optionally, `-o OUT` path to write the methods section. - `-m MODALITY` is the modality to write the methods for. - `yaml` corresponds to the `seqspec` file. 
@@ -479,6 +522,13 @@ The library was sequenced on a Illumina NovaSeq 6000 (EFO:0008637) using the Nov seqspec modify [-h] [--read-id READID] [--read-name READNAME] [--primer-id PRIMERID] [--strand STRAND] [--files FILES] [--region-id REGIONID] [--region-type REGIONTYPE] [--region-name REGIONNAME] [--sequence-type SEQUENCETYPE] [--sequence SEQUENCE] [--min-len MINLEN] [--max-len MAXLEN] [-o OUT] [-i IDs] [-s SELECTOR] -m MODALITY yaml ``` +```python +from seqspec.seqspec_modify import run_modify_read, run_modify_region + +run_modify_read(spec, modality, target_read, read_id, read_name, primer_id, min_len, max_len, strand, files) +run_modify_region(spec, modality, target_region, region_id, region_type, name, sequence_type, sequence, min_len, max_len) +``` + Read modifications - optionally, `--read-id READID` specifies the new `read_id`. @@ -529,6 +579,12 @@ $ seqspec modify -m atac -o mod_spec.yaml -i atac_R1 --files "R1_1.fastq.gz,fast seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f FORMAT] [-i IDs] -m MODALITY yaml ``` +```python +from seqspec.seqspec_onlist import run_onlist + +run_onlist(spec_fn, modality, ids, idtype, fmt, o) +``` + - optionally, `-o OUT` to set the path of the onlist file. - `-m MODALITY` is the modality in which you are searching for the region. - `-i ID` is the `id` of the object to search for the onlist. @@ -563,6 +619,11 @@ Print sequence and/or library structure as ascii, png, or html. seqspec print [-h] [-o OUT] [-f FORMAT] yaml ``` +```python +from seqspec.seqspec_print import run_seqspec_print +run_seqspec_print(spec_fn, fmt, o) +``` + - optionally, `-o OUT` to set the path of printed file. - optionally, `-f FORMAT` is the format of the printed file. Can be one of: - `library-ascii`: prints an ascii tree of the library_spec @@ -651,6 +712,11 @@ $ seqspec print -o spec.png -f seqspec-png spec.yaml seqspec split [-h] -o OUT yaml ``` +```python +from seqspec.seqspec_split import run_split +run_split(spec_fn, o) +``` + - optionally, `-o OUT` name prepended to split specs. - `yaml` corresponds to the `seqspec` file. @@ -673,6 +739,11 @@ split.tag.yaml seqspec version [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_version import run_version +run_version(spec_fn, o) +``` + - optionally, `-o OUT` path to file to write output. - `yaml` corresponds to the `seqspec` file. 
@@ -693,6 +764,11 @@ This is a hidden subcommand that upgrades an old version of the spec to the curr seqspec upgrade [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_upgrade import run_upgrade +run_upgrade(spec_fn, o) +``` + ### Examples ```bash From c9520b49232ec9a488a32ac9aba4099878241fad Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:08:47 -0600 Subject: [PATCH 07/21] support gzipped yaml file for function load_spec (#60) * support gzipped yaml file for function load_spec * fix bug in function run_check * support gzipped yaml file for function load_spec --- seqspec/seqspec_check.py | 4 ++-- seqspec/utils.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 77e94b4..402a99a 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -40,7 +40,7 @@ def validate_check_args(parser, args): def run_check(spec_fn, o): spec = load_spec(spec_fn) - errors = check(spec) + errors = check(spec, spec_fn) if errors: if o: @@ -51,7 +51,7 @@ def run_check(spec_fn, o): return errors -def check(spec: Assay, spec_fn: str = None): +def check(spec: Assay, spec_fn: str): schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as stream: diff --git a/seqspec/utils.py b/seqspec/utils.py index 1ed0b25..eea15dc 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -13,8 +13,20 @@ def load_spec(spec_fn: str): - with open(spec_fn, "r") as stream: - return load_spec_stream(stream) + """ + Reads a YAML file that may be gzipped or not. + + :param spec_fn: Path to the YAML or gzipped YAML file. + :return: Parsed YAML content as a Assay object. + """ + try: + # Check if the file is gzipped by attempting to open it as such + with gzip.open(spec_fn, "rt") as stream: + return load_spec_stream(stream) + except gzip.BadGzipFile: + # If opening as gzip fails, assume it's a regular YAML file + with open(spec_fn, "r") as stream: + return load_spec_stream(stream) def load_spec_stream(spec_stream: io.IOBase): From 1ea72390c4faf979095da6feb8d4e697a5250a91 Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Thu, 20 Feb 2025 14:01:35 -0800 Subject: [PATCH 08/21] enabled skipping checks with seqspec check --- seqspec/seqspec_check.py | 662 +++++++++++++++++++++++++++------------ 1 file changed, 459 insertions(+), 203 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 402a99a..f45afb9 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -19,7 +19,6 @@ def setup_check_args(parser): help="Validate seqspec file against specification", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") subparser.add_argument( "-o", metavar="OUT", @@ -27,174 +26,342 @@ def setup_check_args(parser): type=str, default=None, ) + subparser.add_argument( + "-s", + metavar="SKIP", + help=("Skip checks"), + type=str, + default=None, + choices=["igvf"], + ) + + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) + return subparser def validate_check_args(parser, args): spec_fn = args.yaml o = args.o + s = args.s - return run_check(spec_fn, o) + return run_check(spec_fn, o, s) -def run_check(spec_fn, o): +def run_check(spec_fn, o, s): spec = load_spec(spec_fn) errors = check(spec, spec_fn) + if s == "igvf": + errors = filter_errors(errors, "igvf") if errors: if o: with open(o, "w") as f: - 
print("\n".join(errors), file=f) + for idx, e in enumerate(errors, 1): + print(format_error(e, idx), file=f) else: - print("\n".join(errors)) + for idx, e in enumerate(errors, 1): + print(format_error(e, idx)) return errors -def check(spec: Assay, spec_fn: str): - schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") +IGVF_FILTERS = [ + {"error_type": "check_schema", "error_object": "lib_struct"}, + {"error_type": "check_schema", "error_object": "library_protocol"}, + {"error_type": "check_schema", "error_object": "library_kit"}, + {"error_type": "check_schema", "error_object": "sequence_protocol"}, + {"error_type": "check_schema", "error_object": "sequence_kit"}, + {"error_type": "check_schema", "error_object": "md5"}, +] - with open(schema_fn, "r") as stream: - schema = yaml.load(stream, Loader=yaml.Loader) - validator = Draft4Validator(schema) - errors = [] - idx = 0 - # with open("del.json", "w") as f: - # json.dump(spec.to_dict(), f, indent=4) - - for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): - errors.append( - f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]" - ) - idx += 1 - # check that the modalities are unique - if len(spec.modalities) != len(set(spec.modalities)): - errors.append( - f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" - ) +def filter_errors(errors, filter_type): + if filter_type == "igvf": + et = set([i["error_type"] for i in IGVF_FILTERS]) + eo = set([i["error_object"] for i in IGVF_FILTERS]) + ferrors = [] + for i in errors: + if i["error_type"] not in et and i["error_object"] not in eo: + ferrors.append(i) + return ferrors + else: + return errors + + +def format_error(errobj, idx=0): + return f"[error {idx}] {errobj['error_message']}" + + +def check(spec: Assay, spec_fn: str): + # Variety of checks against schema + def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + with open(schema_fn, "r") as stream: + schema = yaml.load(stream, Loader=yaml.Loader) + validator = Draft4Validator(schema) + for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): + err_elements = [repr(index) for index in error.path] + err_path = f"spec[{']['.join(err_elements)}]" + errobj = { + "error_type": "check_schema", + "error_message": f"{error.message} in {err_path}", + "error_object": err_elements[-1], + } + # errors.append(f"[error {idx}] {error.message} in {err_path}]") + errors.append(errobj) idx += 1 + return (errors, idx) - # check that region_ids of the first level of the spec correspond to the modalities - # one for each modality - modes = spec.modalities - rgns = spec.library_spec - for r in rgns: - rid = r.region_id - if rid not in modes: - errors.append( - f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" - ) + # Modalities are unique + def check_unique_modalities(spec, spec_fn, errors, idx): + if len(spec.modalities) != len(set(spec.modalities)): + errobj = { + "error_type": "check_unique_modalities", + "error_message": f"modalities [{', '.join(spec.modalities)}] are not unique", + "error_object": "modalities", + } + # errors.append( + # f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" + # ) + errors.append(errobj) idx += 1 + return (errors, idx) - # get all of the onlist files in the spec and check that they exist relative to the path of the spec - modes = 
spec.modalities - olrgns = [] - for m in modes: - olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] - - # check paths relative to spec_fn - for ol in olrgns: - if ol.urltype == "local": - if ol.filename[:-3] == ".gz": - check = path.join(path.dirname(spec_fn), ol.filename[:-3]) - if not path.exists(check): - errors.append(f"[error {idx}] {ol.filename[:-3]} does not exist") - idx += 1 - else: - check = path.join(path.dirname(spec_fn), ol.filename) - check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") - if not path.exists(check) and not path.exists(check_gz): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": - # ping the link with a simple http request to check if the file exists at that URI - if spec.seqspec_version == "0.3.0": - if not file_exists(ol.url): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - else: - if not file_exists(ol.filename): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 + # Region_ids of the first level correspond to the modalities (one per modality) + def check_region_ids_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + rgns = spec.library_spec + for r in rgns: + rid = r.region_id + if rid not in modes: + errobj = { + "error_type": "check_region_ids_modalities", + "error_message": f"region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - # read ids should be unique - read_ids = set() - for read in spec.sequence_spec: - if read.read_id in read_ids: - errors.append( - f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" - ) - idx += 1 - else: - read_ids.add(read.read_id) - - # iterate through reads in sequence_spec and check that the fastq files exist - for read in spec.sequence_spec: - spec_fn = path.dirname(spec_fn) - for f in read.files: - if f.urltype == "local": - check = path.join(spec_fn, f.filename) - if not path.exists(check): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 - elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": + # Onlist files exist relative to the path of the spec or http + def check_onlist_files_exist(spec, spec_fn, errors, idx): + modes = spec.modalities + olrgns = [] + for m in modes: + olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] + + # check paths relative to spec_fn + for ol in olrgns: + if ol.urltype == "local": + if ol.filename[:-3] == ".gz": + check = path.join(path.dirname(spec_fn), ol.filename[:-3]) + if not path.exists(check): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename[:-3]} does not exist", + "error_object": "onlist", + } + # errors.append( + # f"[error {idx}] {ol.filename[:-3]} does not exist" + # ) + errors.append(errobj) + idx += 1 + else: + check = path.join(path.dirname(spec_fn), ol.filename) + check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") + if not path.exists(check) and not path.exists(check_gz): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] 
{ol.filename} does not exist") + errors.append(errobj) + idx += 1 + elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI - if not file_exists(f.url): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 + if spec.seqspec_version == "0.3.0": + if not file_exists(ol.url): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + else: + if not file_exists(ol.filename): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the primer ids, strand tuple pairs are unique across all reads - primer_strand_pairs = set() - for read in spec.sequence_spec: - if (read.primer_id, read.strand) in primer_strand_pairs: - errors.append( - f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" - ) - idx += 1 - else: - primer_strand_pairs.add((read.primer_id, read.strand)) + # Read ids are unique + def check_unique_read_ids(spec, spec_fn, errors, idx): + read_ids = set() + for read in spec.sequence_spec: + if read.read_id in read_ids: + errobj = { + "error_type": "check_unique_read_ids", + "error_message": f"read_id '{read.read_id}' is not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" + # ) + errors.append(errobj) + idx += 1 + else: + read_ids.add(read.read_id) + return (errors, idx) - # TODO add option to check md5sum + # Read files exist + def check_read_files_exist(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + spec_fn = path.dirname(spec_fn) + for f in read.files: + if f.urltype == "local": + check = path.join(spec_fn, f.filename) + if not path.exists(check): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": + # ping the link with a simple http request to check if the file exists at that URI + if not file_exists(f.url): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the region_id is unique across all regions - rgn_ids = set() - for m in modes: - for rgn in spec.get_libspec(m).get_leaves(): - if rgn.region_id in rgn_ids: - errors.append( - f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" - ) + # Primer ids, strand tuple pairs are unique across all reads + def check_unique_read_primer_strand_pairs(spec, spec_fn, errors, idx): + primer_strand_pairs = set() + for read in spec.sequence_spec: + if (read.primer_id, read.strand) in primer_strand_pairs: + errobj = { + "error_type": "check_unique_read_primer_strand_pairs", + "error_message": f"primer_id '{read.primer_id}' and strand '{read.strand}' tuple is 
not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" + # ) + errors.append(errobj) idx += 1 else: - rgn_ids.add(rgn.region_id) - - # check that the modality is in the reads - for read in spec.sequence_spec: - if read.modality not in modes: - errors.append( - f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" - ) - idx += 1 + primer_strand_pairs.add((read.primer_id, read.strand)) + return (errors, idx) + + # TODO add option to check md5sum + def check_md5sum(spec, spec_fn, errors, idx): + return (errors, idx) + + # Region_id is unique across all regions + def check_unique_region_ids(spec, spec_fn, errors, idx): + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + errobj = { + "error_type": "check_unique_region_ids", + "error_message": f"region_id '{rgn.region_id}' is not unique across all regions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" + # ) + errors.append(errobj) + idx += 1 + else: + rgn_ids.add(rgn.region_id) + return (errors, idx) + + # Modality is in the reads + def check_read_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + for read in spec.sequence_spec: + if read.modality not in modes: + errobj = { + "error_type": "check_read_modalities", + "error_message": f"read '{read.read_id}' modality '{read.modality}' does not exist in the modalities", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the unique primer ids exist as a region id in the library_spec - for read in spec.sequence_spec: - if read.primer_id not in rgn_ids: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" - ) - idx += 1 + # TODO is there a better way to get the rgn_ids? 
+ def check_primer_ids_in_region_ids(spec, spec_fn, errors, idx): + # first get all unique region_ids + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + pass + else: + rgn_ids.add(rgn.region_id) + + # then check that the primer ids exist in the region_ids + for read in spec.sequence_spec: + if read.primer_id not in rgn_ids: + errobj = { + "error_type": "check_primer_ids_in_region_ids", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # NOTE: this is a strong assumption that may be relaxed in the future # check that the primer id for each read is in the leaves of the spec for that modality - for read in spec.sequence_spec: - mode = spec.get_libspec(read.modality) - leaves = mode.get_leaves() - if read.primer_id not in [i.region_id for i in leaves]: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" - ) - idx += 1 + def check_primer_ids_in_libspec_leaves(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + mode = spec.get_libspec(read.modality) + leaves = mode.get_leaves() + if read.primer_id not in [i.region_id for i in leaves]: + errobj = { + "error_type": "check_primer_ids_in_libspec_leaves", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the max read len is not longer than the max len of the lib spec after the primer # for read in spec.sequence_spec: @@ -202,89 +369,178 @@ def check(spec: Assay, spec_fn: str): # leaves = mode.get_leaves() # idx = [i.region_id for i in leaves].index(read.primer_id) - # if a region has a sequence type "fixed" then it should not contain subregions - # if a region has a sequence type "joiend" then it should contain subregions - # if a region has a sequence type "random" then it should not contain subregions and should be all X's - # if a region has a sequence type "onlist" then it should have an onlist object - def seqtype_check(rgn, errors, idx): - # this is a recursive function that iterates through all regions and checks the sequence type - if rgn.sequence_type == "fixed" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "joined" and not rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" - ) - idx += 1 - if rgn.sequence_type == "onlist" and not 
rgn.onlist: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" - ) - idx += 1 - if rgn.regions: - for r in rgn.regions: - errors, idx = seqtype_check(r, errors, idx) - return (errors, idx) + def check_sequence_types(spec, spec_fn, errors, idx): + modes = spec.modalities + + # if a region has a sequence type "fixed" then it should not contain subregions + # if a region has a sequence type "joiend" then it should contain subregions + # if a region has a sequence type "random" then it should not contain subregions and should be all X's + # if a region has a sequence type "onlist" then it should have an onlist object + def seqtype_check(rgn, errors, idx): + # this is a recursive function that iterates through all regions and checks the sequence type + if rgn.sequence_type == "fixed" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'fixed' and contains subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "joined" and not rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'joined' and does not contain subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "onlist" and not rgn.onlist: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" + # ) + errors.append(errobj) + idx += 1 + if rgn.regions: + for r in rgn.regions: + errors, idx = seqtype_check(r, errors, idx) + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seqtype_check(rgn, errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seqtype_check(rgn, errors, idx) + return (errors, idx) # check the lengths of every region against the max_len, using a recursive function - def len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = len_check(r, errors, idx) - if rgn.max_len < rgn.min_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" - ) - idx += 1 + def check_region_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def len_check(rgn, 
errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = len_check(r, errors, idx) + if rgn.max_len < rgn.min_len: + errobj = { + "error_type": "check_region_lengths", + "error_message": f"'{rgn.region_id}' max_len is less than min_len", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = len_check(rgn, errors, idx) return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = len_check(rgn, errors, idx) + # errors, idx = check_region_lengths(spec, spec_fn, errors, idx) # check that the length of the sequence is equal to the max_len using a recursive function # an assumption in the code and spec is that the displayed sequence is equal to the max_len - def seq_len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = seq_len_check(r, errors, idx) - if rgn.sequence and ( - len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len - ): - # noqa - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" - ) - idx += 1 - return (errors, idx) + def check_sequence_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def seq_len_check(rgn, errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = seq_len_check(r, errors, idx) + if rgn.sequence and ( + len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len + ): + # noqa + errobj = { + "error_type": "check_sequence_lengths", + "error_message": f"'{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seq_len_check(rgn, errors, idx) + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seq_len_check(rgn, errors, idx) + return (errors, idx) # check that the number of files in each "File" object for all Read object are all the same length - nfiles = [] - for read in spec.sequence_spec: - nfiles.append(len(read.files)) + def check_read_file_count(spec, spec_fn, errors, idx): + nfiles = [] + for read in spec.sequence_spec: + nfiles.append(len(read.files)) + + if len(set(nfiles)) != 1: + errobj = { + "error_type": "check_read_file_count", + "error_message": "Reads must have the same number of files", + "error_object": "read", + } + # errors.append(f"[error {idx}] Reads must have the same number of files") + errors.append(errobj) + idx += 1 + return (errors, idx) - if len(set(nfiles)) != 1: - errors.append(f"[error {idx}] Reads must have the same number of files") - idx += 1 + # errors, idx = check_read_file_count(spec, spec_fn, errors, idx) + + errors = [] + idx = 0 + checks = { + "check_schema": check_schema, + "check_unique_modalities": check_unique_modalities, + "check_region_ids_modalities": check_region_ids_modalities, + "check_onlist_files_exist": check_onlist_files_exist, + "check_unique_read_ids": check_unique_read_ids, + "check_read_files_exist": check_read_files_exist, + "check_unique_read_primer_strand_pairs": 
check_unique_read_primer_strand_pairs, + "check_unique_region_ids": check_unique_region_ids, + "check_read_modalities": check_read_modalities, + "check_primer_ids_in_region_ids": check_primer_ids_in_region_ids, + "check_primer_ids_in_libspec_leaves": check_primer_ids_in_libspec_leaves, + "check_sequence_types": check_sequence_types, + "check_region_lengths": check_region_lengths, + "check_sequence_lengths": check_sequence_lengths, + "check_read_file_count": check_read_file_count, + } + for k, v in checks.items(): + errors, idx = v(spec, spec_fn, errors, idx) return errors From 1e1abedf30102fc2aa0c88855e096df04ef62a8e Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Fri, 21 Feb 2025 14:41:51 -0800 Subject: [PATCH 09/21] updating seqspec-html to print read info --- seqspec/seqspec_print.py | 33 ++++++++------ seqspec/seqspec_print_html.py | 84 +++++++++++++++++++++++++++++++---- 2 files changed, 95 insertions(+), 22 deletions(-) diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index be7f304..1324ea8 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -71,7 +71,7 @@ def run_seqspec_print(spec_fn, fmt, o): s = CMD[fmt](spec) if fmt == "png": - return s.savefig(o, dpi=300, bbox_inches="tight") + return s.savefig(o, dpi=300, bbox_inches="tight") # if o: with open(o, "w") as f: @@ -84,10 +84,26 @@ def run_seqspec_print(spec_fn, fmt, o): def print_seqspec_ascii(spec): p = [] for modality in spec.modalities: - p.append(libseq(spec, modality)) + p.append(format_libseq(spec, modality, *libseq(spec, modality))) return "\n".join(p) +def format_libseq(spec, modality, p, n): + libspec = spec.get_libspec(modality) + + s = "\n".join( + [ + modality, + "---", + "\n".join(p), + libspec.sequence, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) + return s + + def libseq(spec, modality): libspec = spec.get_libspec(modality) seqspec = spec.get_seqspec(modality) @@ -118,18 +134,7 @@ def libseq(spec, modality): arrow = arrowl * "-" n.append(f"{ws}<{arrow}|({idx}) {read_id}") - - s = "\n".join( - [ - modality, - "---", - "\n".join(p), - libspec.sequence, - complement_sequence(libspec.sequence), - "\n".join(n), - ] - ) - return s + return (p, n) def run_print(data): diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index f469238..69b904d 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,4 +1,7 @@ +from seqspec.Assay import Assay from seqspec.Region import Region +from seqspec.Read import Read +from seqspec.Read import File def print_seqspec_html(spec): @@ -94,6 +97,7 @@ def atomicRegionTemplate(
  • onlist: {onlist}
  • regions: {subseq}
  • + """ return s @@ -116,21 +120,85 @@ def regionsTemplate(regions): return s -def libStructTemplate(region): +def libStructTemplate(spec, modality): + from seqspec.seqspec_print import libseq + from seqspec.Region import complement_sequence + + libspec = spec.get_libspec(modality) + seqspec = spec.get_seqspec(modality) # noqa + p, n = libseq(spec, modality) + + cseq = colorSeq(libspec.get_leaves()) + seq = "\n".join( + [ + "\n".join(p), + cseq, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) s = f""" -
    {region.name}
    +
    {modality}
    -{colorSeq(region.get_leaves())}
    +{seq} """ return s -def multiModalTemplate(library_spec): - s = "".join( - [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec] - ) +def atomicReadTemplate(read: Read): + files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else "" + + s = f""" +
    + {read.name} +
      +
    • read_id: {read.read_id}
    • +
    • primer_id: {read.primer_id}
    • +
    • min_len: {read.min_len}
    • +
    • max_len: {read.max_len}
    • +
    • strand: {read.strand}
    • +
    • + files: +
        + {files} +
      +
    • +
    +
    + """ + return s + + +def atomicFileTemplate(file: File): + s = f""" +
  • {file.filename} (md5: {file.md5})
  • + """ + return s + + +def readsTemplate(reads): + s = f"""
    1. + {'
    2. '.join([atomicReadTemplate(r) for r in reads])} +
    """ + return s + + +def multiModalTemplate(spec: Assay): + modes = spec.modalities + s = "" + for m in modes: + libspec = spec.get_libspec(m) + seqspec = spec.get_seqspec(m) + + s += f""" + {libStructTemplate(spec, m)} +

    Sequence structure

+    {readsTemplate(seqspec)}
+

    Library structure

    + {regionsTemplate(libspec.get_leaves())} + """ return s @@ -173,7 +241,7 @@ def htmlTemplate(spec):

    Final library

-    {multiModalTemplate(spec.library_spec)}
+    {multiModalTemplate(spec)}
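A minimal usage sketch of the refactored print helpers in this patch (illustrative only, not part of the diff): libseq() now returns just the read-annotation lines above (p) and below (n) the library sequence, and format_libseq() assembles the final ASCII diagram, which is what lets seqspec_print_html.py reuse the same p/n lines. The function names and signatures come from the hunks above; the spec path "spec.yaml" is an assumption.

    from seqspec.utils import load_spec
    from seqspec.seqspec_print import libseq, format_libseq

    spec = load_spec("spec.yaml")
    for modality in spec.modalities:
        p, n = libseq(spec, modality)               # read arrows above (p) and below (n) the sequence
        print(format_libseq(spec, modality, p, n))  # modality header, sequence, complement, read arrows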
    From 1b2f3456a0fc88655f692ff48f47d1e706e61379 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 20 Mar 2025 08:49:59 -0500 Subject: [PATCH 10/21] CHECK-161-onlist (#3) --- seqspec/seqspec_check.py | 47 +++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index f45afb9..766d4fd 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -32,7 +32,7 @@ def setup_check_args(parser): help=("Skip checks"), type=str, default=None, - choices=["igvf"], + choices=["igvf", "igvf_onlist_skip"], ) subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) @@ -54,6 +54,8 @@ def run_check(spec_fn, o, s): errors = check(spec, spec_fn) if s == "igvf": errors = filter_errors(errors, "igvf") + elif s == "igvf_onlist_skip": + errors = filter_errors(errors, "igvf_onlist_skip") if errors: if o: @@ -66,24 +68,43 @@ def run_check(spec_fn, o, s): return errors -IGVF_FILTERS = [ - {"error_type": "check_schema", "error_object": "lib_struct"}, - {"error_type": "check_schema", "error_object": "library_protocol"}, - {"error_type": "check_schema", "error_object": "library_kit"}, - {"error_type": "check_schema", "error_object": "sequence_protocol"}, - {"error_type": "check_schema", "error_object": "sequence_kit"}, - {"error_type": "check_schema", "error_object": "md5"}, -] +IGVF_FILTERS = { + "check_schema": [ + "'lib_struct'", + "'library_protocol'", + "'library_kit'", + "'sequence_protocol'", + "'sequence_kit'", + "'md5'", + ], +} +IGVF_ONLIST_SKIP_FILTERS = { + "check_schema": [ + "'lib_struct'", + "'library_protocol'", + "'library_kit'", + "'sequence_protocol'", + "'sequence_kit'", + "'md5'", + ], + "check_onlist_files_exist": ["onlist"], +} def filter_errors(errors, filter_type): + filters = None if filter_type == "igvf": - et = set([i["error_type"] for i in IGVF_FILTERS]) - eo = set([i["error_object"] for i in IGVF_FILTERS]) + filters = IGVF_FILTERS + elif filter_type == "igvf_onlist_skip": + filters = IGVF_ONLIST_SKIP_FILTERS + if filters: ferrors = [] for i in errors: - if i["error_type"] not in et and i["error_object"] not in eo: - ferrors.append(i) + error_type = i["error_type"] + error_object = i["error_object"] + if error_type in filters and error_object in filters[error_type]: + continue + ferrors.append(i) return ferrors else: return errors From 676c0200edaca7e3cce6b3c15094fd5fb8ee899a Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:03:08 -0500 Subject: [PATCH 11/21] Merge devel to dev (#7) --- seqspec/File.py | 24 ++++++----------- seqspec/Read.py | 17 +++--------- seqspec/seqspec_check.py | 51 ++++++++++++++++-------------------- seqspec/seqspec_print.py | 7 ++--- setup.cfg | 2 +- tests/test_region.py | 37 +++++++++++++++++++++----- tests/test_seqspec_check.py | 2 +- tests/test_seqspec_onlist.py | 35 +++++++++++++------------ tests/test_seqspec_print.py | 8 +++--- tests/test_utils.py | 50 +++++++++++++++++++++++++++-------- 10 files changed, 132 insertions(+), 101 deletions(-) diff --git a/seqspec/File.py b/seqspec/File.py index 70a52f1..423dced 100644 --- a/seqspec/File.py +++ b/seqspec/File.py @@ -24,26 +24,18 @@ def __init__( self.md5 = md5 def __repr__(self) -> str: - d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": 
self.md5, - } + d = self.to_dict() return f"{d}" def to_dict(self): d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": self.md5, + "file_id": getattr(self, "file_id", None), + "filename": getattr(self, "filename", None), + "filetype": getattr(self, "filetype", None), + "filesize": getattr(self, "filesize", None), + "url": getattr(self, "url", None), + "urltype": getattr(self, "urltype", None), + "md5": getattr(self, "md5", None), } return d diff --git a/seqspec/Read.py b/seqspec/Read.py index 6681db3..6984998 100644 --- a/seqspec/Read.py +++ b/seqspec/Read.py @@ -32,24 +32,13 @@ def set_files(self, files: Optional[List["File"]] = []): self.files = files def __repr__(self) -> str: - d = { - "read_id": self.read_id, - "name": self.name, - "modality": self.modality, - "primer_id": self.primer_id, - "min_len": self.min_len, - "max_len": self.max_len, - "strand": self.strand, - "files": self.files, - } + d = self.to_dict() return f"{d}" def to_dict(self): # TODO is this necessary for backwards compatibility? - if self.files: - files = [i.to_dict() for i in self.files] - else: - files = [] + files = getattr(self, "files", []) + files = [i.to_dict() for i in files] d = { "read_id": self.read_id, "name": self.name, diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 766d4fd..1e62276 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -68,28 +68,17 @@ def run_check(spec_fn, o, s): return errors -IGVF_FILTERS = { - "check_schema": [ - "'lib_struct'", - "'library_protocol'", - "'library_kit'", - "'sequence_protocol'", - "'sequence_kit'", - "'md5'", - ], -} -IGVF_ONLIST_SKIP_FILTERS = { - "check_schema": [ - "'lib_struct'", - "'library_protocol'", - "'library_kit'", - "'sequence_protocol'", - "'sequence_kit'", - "'md5'", - ], - "check_onlist_files_exist": ["onlist"], -} - +IGVF_FILTERS = [ + {"error_type": "check_schema", "error_object": "'lib_struct'"}, + {"error_type": "check_schema", "error_object": "'library_protocol'"}, + {"error_type": "check_schema", "error_object": "'library_kit'"}, + {"error_type": "check_schema", "error_object": "'sequence_protocol'"}, + {"error_type": "check_schema", "error_object": "'sequence_kit'"}, + {"error_type": "check_schema", "error_object": "'md5'"}, +] +IGVF_ONLIST_SKIP_FILTERS = IGVF_FILTERS + [ + {"error_type": "check_onlist_files_exist", "error_object": "onlist"} +] def filter_errors(errors, filter_type): filters = None @@ -97,14 +86,20 @@ def filter_errors(errors, filter_type): filters = IGVF_FILTERS elif filter_type == "igvf_onlist_skip": filters = IGVF_ONLIST_SKIP_FILTERS + if filters: ferrors = [] - for i in errors: - error_type = i["error_type"] - error_object = i["error_object"] - if error_type in filters and error_object in filters[error_type]: - continue - ferrors.append(i) + for error in errors: + # Check if this specific error combination exists in the filters + should_filter = any( + error["error_type"] == filter_item["error_type"] + and error["error_object"] == filter_item["error_object"] + for filter_item in filters + ) + + # Only keep errors that don't match our filter criteria + if not should_filter: + ferrors.append(error) return ferrors else: return errors diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 1324ea8..11322f6 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -201,9 +201,10 @@ def plot_png(assay, modalities, modes, nmodes, 
lengths): plt.rcParams.update({"font.size": fsize}) fig, ax = plt.subplots( - figsize=(10, 1 * nmodes), nrows=nmodes, constrained_layout=True + figsize=(10, 1 * nmodes), nrows=nmodes ) - fig.suptitle(assay) + title_offset = 0.98 if nmodes > 1 else 1.2 + fig.suptitle(assay, y=title_offset) rts = [] for m, ax in zip(modes, fig.get_axes()): # get leaves @@ -245,7 +246,7 @@ def plot_png(assay, modalities, modes, nmodes, lengths): ax.autoscale() # since all axes use the same scale, set the xlim to be 0 to the max length - ax.set(**{"xlim": (0, max(lengths))}) + ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)}) # hide the spines for spine in ["right", "top", "left", "bottom"]: diff --git a/setup.cfg b/setup.cfg index ec77cff..435b9be 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ max-line-length = 88 extend-ignore = E203,E501 [tox:tox] -env_list = py{37,38,39,310,311} +env_list = py{311,312,313} skip_missing_interpreters = True [testenv] diff --git a/tests/test_region.py b/tests/test_region.py index 2e8127c..710665e 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -52,21 +52,37 @@ def read_rna_dict(read_id, min_len=0, max_len=100): "min_len": min_len, "max_len": max_len, "strand": "pos", + "files": [], } return expected class TestOnlist(TestCase): def test_simple_onlist(self): - name = "barcodes.txt" + file_id = "123" + filename = "barcodes.tsv" + filetype = "tsv" + filesize = 300 + url = filename + urltype = "file" md5sum = "d41d8cd98f00b204e9800998ecf8427e" location = "local" - permit = Onlist(name, md5sum, location) + permit = Onlist( + file_id, filename, filetype, filesize, url, "file", md5sum, location + ) self.assertEqual( permit.to_dict(), - {"filename": name, "md5": md5sum, "location": location}, + { + "file_id": file_id, + "filename": filename, + "filetype": filetype, + "filesize": filesize, + "url": url, + "urltype": urltype, + "md5": md5sum, + }, ) @@ -181,11 +197,16 @@ def test_onlists(self): sequence_type = "stuff" sequence = "AACGTGAT" - list_name = "barcodes.txt" + list_id = "123" + list_name = "barcodes.tsv" + list_type = "tsv" + list_size = 300 + list_url = list_name + list_urltype = "file" list_md5sum = "d41d8cd98f00b204e9800998ecf8427e" list_location = "local" - permited = Onlist(list_name, list_md5sum, list_location) + permited = Onlist(list_id, list_name, list_type, list_size, list_url, list_urltype, list_md5sum, list_location) r = Region( region_name, @@ -202,8 +223,12 @@ def test_onlists(self): "name": region_name, "sequence_type": sequence_type, "onlist": { + "file_id": list_id, "filename": list_name, - "location": list_location, + "filetype": list_type, + "filesize": list_size, + "url": list_name, + "urltype": list_urltype, "md5": list_md5sum, }, "sequence": sequence, diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index c613257..ae8ad6b 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -49,4 +49,4 @@ def test_validate_check_args(self): with patch("os.path.exists") as path_exists: path_exists.return_value = True errors = validate_check_args(None, args) - self.assertEqual(errors, None) + self.assertEqual(errors, []) diff --git a/tests/test_seqspec_onlist.py b/tests/test_seqspec_onlist.py index 6037088..40f6ad9 100644 --- a/tests/test_seqspec_onlist.py +++ b/tests/test_seqspec_onlist.py @@ -49,30 +49,30 @@ def test_run_onlist_region(self): self.assertEqual(len(regions), 1) region = regions[0] self.assertEqual(region.location, "local") - self.assertEqual(region.filename, "index_onlist.txt") + 
self.assertEqual(region.filename, "index_onlist.tsv") self.assertEqual(region.md5, "939cb244b4c43248fcc795bbe79599b0") def test_run_onlist_read(self): - with create_temporary_barcode_files(["index_onlist.txt"]): + with create_temporary_barcode_files(["index_onlist.tsv"]): spec = load_example_spec(example_spec) reads = run_onlist_read(spec, "rna", "read2.fastq.gz") self.assertEqual(len(reads), 1) read = reads[0] self.assertEqual(read.location, "local") - self.assertEqual(read.filename, "index_onlist.txt") + self.assertEqual(read.filename, "index_onlist.tsv") self.assertEqual(read.md5, "939cb244b4c43248fcc795bbe79599b0") def test_find_list_target_dir_local(self): with create_temporary_barcode_files(["index_onlist.txt"]) as tmpdir: - filename = os.path.join(tmpdir, "temp.txt") + filename = os.path.join(tmpdir, "temp.tsv") - onlist1 = Onlist(filename, "d41d8cd98f00b204e9800998ecf8427e", "local") + onlist1 = Onlist("temp_id", filename, "tsv", 300, filename, "local", "d41d8cd98f00b204e9800998ecf8427e", "local") target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, tmpdir) def test_find_list_target_dir_remote(self): - onlist1 = Onlist("http:localhost:9/temp.txt", "d41d8cd98f00b204e9800998ecf8427e", "remote") + onlist1 = Onlist("temp_id", "temp.tsv", "tsv", 300, "http://localhost:9/temp.tsv", "http", "d41d8cd98f00b204e9800998ecf8427e", "remote") target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, os.getcwd()) @@ -124,7 +124,7 @@ def test_join_onlist_multi(self): self.assertEqual(joined[2], "- GGTT") def test_local_validate_onlist_args(self): - onlist_name = "index_onlist.txt" + onlist_name = "index_onlist.tsv" with create_temporary_barcode_files([onlist_name]) as tmpdir: expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") @@ -133,15 +133,13 @@ def test_local_validate_onlist_args(self): subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) args = parser.parse_args([ - "onlist", "-m", "rna", "-r", "read1.fastq.gz", "-f", "multi", spec_path]) + "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) def load_spec(*args, **kwargs): return load_example_spec(example_spec) with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - onlist_path = validate_onlist_args(parser, args) - - self.assertEqual(onlist_path, expected_onlist_path) + validate_onlist_args(parser, args) def test_local_cached_remote_validate_onlist_args(self): # Test that we will can use a locally cached copy of one barcode file @@ -155,23 +153,26 @@ def test_local_cached_remote_validate_onlist_args(self): subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) args = parser.parse_args([ - "onlist", "-m", "rna", "-r", "read1.fastq.gz", "-f", "multi", spec_path]) + "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) def load_spec(*args, **kwargs): remote_spec = example_spec.replace( "location: local", "location: remote" ).replace( - "filename: index_onlist.txt", - "filename: http://localhost:9/foo/index_onlist.txt" + "url: index_onlist.tsv", + "url: http://localhost:9/foo/index_onlist.tsv" + ).replace( + "urltype: local", + "urltype: http", ) print(remote_spec) return load_example_spec(remote_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - onlist_path = validate_onlist_args(parser, args) + with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader, 
patch("seqspec.seqspec_onlist.read_remote_list", return_value="index_onlist.tsv") as fake_remote_list: + # Failed validation would raise an exception + validate_onlist_args(parser, args) - self.assertEqual(onlist_path, expected_onlist_path) def test_write_onlist_no_double_spacing(self): # Make sure that joined onlists don't end up double spaced. diff --git a/tests/test_seqspec_print.py b/tests/test_seqspec_print.py index 955281a..c8b229e 100644 --- a/tests/test_seqspec_print.py +++ b/tests/test_seqspec_print.py @@ -5,8 +5,8 @@ from matplotlib.figure import Figure from seqspec.seqspec_print import ( - run_print_library_tree, - run_print_library_png, + print_library_ascii, + print_seqspec_png, ) from seqspec.utils import load_spec_stream @@ -18,10 +18,10 @@ def setUp(self): self.example_spec = load_spec_stream(StringIO(example_spec_text)) def test_seqspec_print_tree(self): - tree = run_print_library_tree(self.example_spec) + tree = print_library_ascii(self.example_spec) self.assertIn("SOLiD_P1_adapter", tree) def test_seqspec_print_png(self): - fig = run_print_library_png(self.example_spec) + fig = print_seqspec_png(self.example_spec) self.assertIsInstance(fig, Figure) diff --git a/tests/test_utils.py b/tests/test_utils.py index e1cc281..34a36bc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -28,8 +28,8 @@ region_rna_linker_dict, ) -example_spec = f"""!Assay -seqspec_version: { __version__ } +example_spec = """!Assay +seqspec_version: 0.3.0 assay_id: test_assay name: my assay doi: https://doi.org/10.1038/nmeth.1315 @@ -39,10 +39,10 @@ modalities: - rna lib_struct: https://teichlab.github.io/scg_lib_structs/methods_html/tang2009.html -library_protocol: custom 1 -library_kit: custom 2 -sequence_protocol: custom 3 -sequence_kit: custom 4 +library_protocol: "Custom" +library_kit: "Custom" +sequence_protocol: "Custom" +sequence_kit: "Custom" sequence_spec: - !Read read_id: read1.fastq.gz @@ -51,8 +51,16 @@ primer_id: SOLiD_P1_adapter min_len: 90 max_len: 187 - # this is a guess strand: pos + files: + - !File + file_id: read1 + filename: read1.fastq.gz + filetype: fastq + filesize: 123456789 + url: read1.fastq.gz + urltype: local + md5: 68b329da9893e34099c7d8ad5cb9c940 - !Read read_id: read2.fastq.gz name: read2 for experiment @@ -61,6 +69,15 @@ min_len: 25 max_len: 25 strand: neg + files: + - !File + file_id: read2 + filename: read2.fastq.gz + filetype: fastq + filesize: 123456789 + url: read2.fastq.gz + urltype: local + md5: 68b329da9893e34099c7d8ad5cb9c940 library_spec: - !Region region_id: rna @@ -114,7 +131,12 @@ min_len: 6 max_len: 6 onlist: !Onlist - filename: index_onlist.txt + file_id: onlist-1 + filename: index_onlist.tsv + filetype: tsv + filesize: 300 + url: index_onlist.tsv + urltype: local md5: 939cb244b4c43248fcc795bbe79599b0 location: local regions: null @@ -195,7 +217,7 @@ def test_read_local_list(self): with gzip.open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist(temp_list_filename, fake_md5, "local") + onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -210,7 +232,7 @@ def test_read_local_list_gz(self): with open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist(temp_list_filename, fake_md5, "local") + onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") loaded_list = read_local_list(onlist1) 
self.assertEqual(fake_onlist, loaded_list) @@ -233,12 +255,18 @@ def raise_for_status(self): return response() with patch("requests.get", new=fake_request_get): - onlist1 = Onlist("http://localhost/testlist.txt", fake_md5, "remote") + url = "http://localhost/testlist.txt" + onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5, "remote") loaded_list = read_remote_list(onlist1) self.assertEqual(fake_onlist, loaded_list) def test_get_igvf_auth(self): + # clean out the environment we inherited + for term in ["IGVF_SECRET_KEY", "IGVF_API_KEY"]: + if term in os.environ: + del os.environ[term] + test_data = [ (None, None, None), ("user", "pass", ("user", "pass")), From c62b0079ee4c813d9ed8fe141d4fb35d5040a366 Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 30 Apr 2025 14:58:02 -0500 Subject: [PATCH 12/21] fix version --- README.md | 3 +-- seqspec/__init__.py | 9 ++++++++- seqspec/seqspec_check.py | 4 +++- seqspec/seqspec_onlist.py | 16 ++++++++++++++++ seqspec/seqspec_upgrade.py | 7 ++++--- setup.cfg | 1 - setup.py | 3 ++- 7 files changed, 34 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2c7c115..9ddbafb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # seqspec -![github version](https://img.shields.io/badge/Version-0.3.1-informational) -[![pypi version](https://img.shields.io/pypi/v/seqspec)](https://pypi.org/project/seqspec/0.3.1/) +![github version](https://img.shields.io/badge/Version-0.3.0-informational) ![python versions](https://img.shields.io/pypi/pyversions/seqspec) [![license](https://img.shields.io/pypi/l/seqspec)](LICENSE) diff --git a/seqspec/__init__.py b/seqspec/__init__.py index 260c070..ad6cb61 100644 --- a/seqspec/__init__.py +++ b/seqspec/__init__.py @@ -1 +1,8 @@ -__version__ = "0.3.1" +__version__ = "0.3.0" + + +def get_version(): + """ + Returns the version of the package. 
+ """ + return __version__ diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 1e62276..3b5a8b0 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -4,6 +4,7 @@ from seqspec.utils import load_spec, file_exists from seqspec.Assay import Assay from argparse import RawTextHelpFormatter +from seqspec import get_version def setup_check_args(parser): @@ -80,6 +81,7 @@ def run_check(spec_fn, o, s): {"error_type": "check_onlist_files_exist", "error_object": "onlist"} ] + def filter_errors(errors, filter_type): filters = None if filter_type == "igvf": @@ -200,7 +202,7 @@ def check_onlist_files_exist(spec, spec_fn, errors, idx): idx += 1 elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI - if spec.seqspec_version == "0.3.0": + if spec.seqspec_version == get_version(): if not file_exists(ol.url): errobj = { "error_type": "check_onlist_files_exist", diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index b021a5a..dac8230 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -124,6 +124,9 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): } onlists = CMD[idtype](spec, modality, ids) + print("idtype:", idtype) + print("modality:", modality) + print("ids:", ids) if len(onlists) == 0: raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") @@ -162,6 +165,7 @@ def run_onlist_region_type( spec: Assay, modality: str, region_type: str ) -> List[Onlist]: regions = find_by_region_type(spec, modality, region_type) + print("regions:", regions) onlists: List[Onlist] = [] for r in regions: ol = r.get_onlist() @@ -182,15 +186,27 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist def run_onlist_read(spec: Assay, modality: str, read_id: str) -> List[Onlist]: (read, rgns) = map_read_id_to_regions(spec, modality, read_id) + print("read:", read) + print() + print("rgns:", rgns) + print() # convert regions to region coordinates rcs = project_regions_to_coordinates(rgns) + print("rcs:", rcs) + print() # intersect read with region coordinates new_rcs = itx_read(rcs, 0, read.max_len) + print("new_rcs:", new_rcs) + print("reads mex len:", read.max_len) + print() onlists: List[Onlist] = [] for r in new_rcs: ol = r.get_onlist() if ol: + print("region:", r) + print("onlist:", ol) + print() onlists.append(ol) return onlists diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 8a982a4..032adbb 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -2,6 +2,7 @@ from seqspec.File import File from seqspec.Region import Onlist from argparse import RawTextHelpFormatter +from seqspec import get_version def setup_upgrade_args(parser): @@ -53,14 +54,14 @@ def upgrade(spec, version): "0.1.0": upgrade_0_1_0_to_0_3_0, "0.1.1": upgrade_0_1_1_to_0_3_0, "0.2.0": upgrade_0_2_0_to_0_3_0, - "0.3.0": upgrade_0_3_0_to_0_3_0, + get_version(): no_upgrade, } u = UPGRADE[version](spec) return u -def upgrade_0_3_0_to_0_3_0(spec): +def no_upgrade(spec): return spec @@ -100,7 +101,7 @@ def upgrade_0_2_0_to_0_3_0(spec): md5=md5, location=location, ) - spec.seqspec_version = "0.3.0" + spec.seqspec_version = get_version() return spec diff --git a/setup.cfg b/setup.cfg index 435b9be..594931a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,4 @@ [bumpversion] -current_version = 0.3.1 commit = True tag = True diff --git a/setup.py b/setup.py index 5afafa8..1cff4a8 100644 --- a/setup.py 
+++ b/setup.py @@ -1,4 +1,5 @@ from setuptools import find_packages, setup +import seqspec def read(path): @@ -10,7 +11,7 @@ def read(path): setup( name="seqspec", - version="0.3.1", + version=seqspec.get_version(), url="https://github.com/sbooeshaghi/seqspec", author="Sina Booeshaghi", author_email="abooesha@caltech.edu", From 255c2ca64527765215aaea8a432b337d4d357fde Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 30 Apr 2025 15:01:40 -0500 Subject: [PATCH 13/21] remove print --- seqspec/seqspec_onlist.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index dac8230..b021a5a 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -124,9 +124,6 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): } onlists = CMD[idtype](spec, modality, ids) - print("idtype:", idtype) - print("modality:", modality) - print("ids:", ids) if len(onlists) == 0: raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") @@ -165,7 +162,6 @@ def run_onlist_region_type( spec: Assay, modality: str, region_type: str ) -> List[Onlist]: regions = find_by_region_type(spec, modality, region_type) - print("regions:", regions) onlists: List[Onlist] = [] for r in regions: ol = r.get_onlist() @@ -186,27 +182,15 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist def run_onlist_read(spec: Assay, modality: str, read_id: str) -> List[Onlist]: (read, rgns) = map_read_id_to_regions(spec, modality, read_id) - print("read:", read) - print() - print("rgns:", rgns) - print() # convert regions to region coordinates rcs = project_regions_to_coordinates(rgns) - print("rcs:", rcs) - print() # intersect read with region coordinates new_rcs = itx_read(rcs, 0, read.max_len) - print("new_rcs:", new_rcs) - print("reads mex len:", read.max_len) - print() onlists: List[Onlist] = [] for r in new_rcs: ol = r.get_onlist() if ol: - print("region:", r) - print("onlist:", ol) - print() onlists.append(ol) return onlists From 17b1923cd507ecdf42e5b36d6c544fecc9e2b171 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Mon, 30 Jun 2025 14:07:33 -0500 Subject: [PATCH 14/21] CHECK-207-kb-single (#9) --- requirements.txt | 3 +- seqspec/Region.py | 2 +- seqspec/main.py | 133 ++++++++----- seqspec/seqspec_check.py | 50 +++-- seqspec/seqspec_convert.py | 137 ++++++++----- seqspec/seqspec_diff.py | 169 +++++++++++++--- seqspec/seqspec_file.py | 292 +++++++++++++++++----------- seqspec/seqspec_find.py | 151 ++++++++++----- seqspec/seqspec_format.py | 75 ++++++-- seqspec/seqspec_index.py | 180 +++++++++++------ seqspec/seqspec_info.py | 79 +++++--- seqspec/seqspec_init.py | 190 ++++++++++-------- seqspec/seqspec_methods.py | 91 +++++---- seqspec/seqspec_modify.py | 233 +++++++++++----------- seqspec/seqspec_onlist.py | 101 ++++++---- seqspec/seqspec_print.py | 342 +++++++++++++++------------------ seqspec/seqspec_print_html.py | 95 ++++----- seqspec/seqspec_print_utils.py | 79 ++++++++ seqspec/seqspec_split.py | 73 ++++--- seqspec/seqspec_upgrade.py | 84 ++++---- seqspec/seqspec_version.py | 59 +++--- setup.cfg | 2 +- 22 files changed, 1603 insertions(+), 1017 deletions(-) create mode 100644 seqspec/seqspec_print_utils.py diff --git a/requirements.txt b/requirements.txt index 4d23455..acff050 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ jsonschema newick requests biopython -packaging \ No newline at end of file +packaging +matplotlib>=3.4.0 \ No newline at 
end of file diff --git a/seqspec/Region.py b/seqspec/Region.py index 9a2a7fd..f7edbc0 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -417,7 +417,7 @@ def __init__( url: str, urltype: str, md5: str, - location: Optional[str], + # location: Optional[str], ) -> None: super().__init__() self.file_id = file_id diff --git a/seqspec/main.py b/seqspec/main.py index a5bb35e..b800ab7 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -1,46 +1,48 @@ -from . import __version__ -import argparse -import sys -from .seqspec_format import setup_format_args, validate_format_args -from .seqspec_print import setup_print_args, validate_print_args -from .seqspec_check import setup_check_args, validate_check_args -from .seqspec_find import setup_find_args, validate_find_args - -from .seqspec_convert import setup_convert_args, validate_convert_args -from .seqspec_modify import setup_modify_args, validate_modify_args -from .seqspec_index import setup_index_args, validate_index_args -from .seqspec_info import setup_info_args, validate_info_args - -from .seqspec_split import setup_split_args, validate_split_args -from .seqspec_init import setup_init_args, validate_init_args -from .seqspec_onlist import setup_onlist_args, validate_onlist_args -from .seqspec_version import setup_version_args, validate_version_args -from .seqspec_methods import setup_methods_args, validate_methods_args -from .seqspec_file import setup_file_args, validate_file_args -from .seqspec_upgrade import setup_upgrade_args, validate_upgrade_args +"""Main module for seqspec CLI. -import warnings - -# Steps to add new subcommands -# Create seqspec_subcommand.py (create setup_subcmd_args, validate_subcmd_args, run_subcmd in that file) -# (in this file) from seqspec_subcmd import setup_subcmd_args, validate_subcmd_args -# Add setup_subcmd_args to command_to_parser along with its key==str(subcmd) -# Add validate_subcmd_args to COMMAND_TO_FUNCTION along with its key==str(subcmd) +This module provides the main entry point for the seqspec command-line interface. +It handles argument parsing, command routing, and execution of subcommands. +""" +import sys +import warnings +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, Callable, Any -def main(): - warnings.simplefilter("default", DeprecationWarning) +from . import __version__ - # setup parsers - parser = argparse.ArgumentParser( +# Import subcommand modules +from .seqspec_format import setup_format_args, run_format +from .seqspec_print import setup_print_args, run_print +from .seqspec_check import setup_check_args, run_check +from .seqspec_find import setup_find_args, run_find +from .seqspec_convert import setup_convert_args, run_convert +from .seqspec_modify import setup_modify_args, run_modify +from .seqspec_index import setup_index_args, run_index +from .seqspec_info import setup_info_args, run_info +from .seqspec_split import setup_split_args, run_split +from .seqspec_init import setup_init_args, run_init +from .seqspec_onlist import setup_onlist_args, run_onlist +from .seqspec_version import setup_version_args, run_version +from .seqspec_methods import setup_methods_args, run_methods +from .seqspec_file import setup_file_args, run_file +from .seqspec_upgrade import setup_upgrade_args, run_upgrade + + +def setup_parser(): + """Create and configure the main argument parser. + + Returns: + Configured ArgumentParser instance. 
+ """ + parser = ArgumentParser( description=f""" seqspec {__version__}: A machine-readable file format for genomic library sequence and structure. GitHub: https://github.com/pachterlab/seqspec Documentation: https://pachterlab.github.io/seqspec/ - """, - formatter_class=argparse.RawTextHelpFormatter, + formatter_class=RawTextHelpFormatter, ) subparsers = parser.add_subparsers( @@ -67,7 +69,18 @@ def main(): "version": setup_version_args(subparsers), } - # Show help when no arguments are given + return parser, command_to_parser + + +def handle_no_args( + parser: ArgumentParser, command_to_parser: Dict[str, ArgumentParser] +) -> None: + """Handle case when no arguments are provided. + + Args: + parser: Main argument parser. + command_to_parser: Dictionary mapping commands to their parsers. + """ if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -80,27 +93,43 @@ def main(): parser.print_help(sys.stderr) sys.exit(1) + +def main() -> None: + """Main entry point for the seqspec CLI.""" + warnings.simplefilter("default", DeprecationWarning) + + parser, command_to_parser = setup_parser() + handle_no_args(parser, command_to_parser) + args = parser.parse_args() - # Setup validator and runner for all subcommands (validate and run if valid) - COMMAND_TO_FUNCTION = { - "format": validate_format_args, - "print": validate_print_args, - "check": validate_check_args, - "find": validate_find_args, - "index": validate_index_args, - "info": validate_info_args, - "init": validate_init_args, - "methods": validate_methods_args, - "modify": validate_modify_args, - "onlist": validate_onlist_args, - "split": validate_split_args, - "version": validate_version_args, - "file": validate_file_args, - "upgrade": validate_upgrade_args, - "convert": validate_convert_args, + # Setup validator and runner for all subcommands + command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = { + "format": run_format, + "print": run_print, + "check": run_check, + "find": run_find, + "index": run_index, + "info": run_info, + "init": run_init, + "methods": run_methods, + "modify": run_modify, + "onlist": run_onlist, + "split": run_split, + "version": run_version, + "file": run_file, + "upgrade": run_upgrade, + "convert": run_convert, } - COMMAND_TO_FUNCTION[sys.argv[1]](parser, args) + + try: + command_to_function[sys.argv[1]](parser, args) + except KeyError: + parser.print_help(sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 3b5a8b0..82a2894 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -1,13 +1,23 @@ +"""Check module for seqspec CLI. + +This module provides functionality to validate seqspec files against the specification schema. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from jsonschema import Draft4Validator import yaml from os import path + from seqspec.utils import load_spec, file_exists from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter + from seqspec import get_version def setup_check_args(parser): + """Create and configure the check command subparser.""" subparser = parser.add_parser( "check", description=""" @@ -22,45 +32,51 @@ def setup_check_args(parser): ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "-s", + "--skip", metavar="SKIP", - help=("Skip checks"), + help="Skip checks", type=str, default=None, choices=["igvf", "igvf_onlist_skip"], ) - subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) return subparser -def validate_check_args(parser, args): - spec_fn = args.yaml - o = args.o - s = args.s +def validate_check_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the check command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - return run_check(spec_fn, o, s) +def run_check(parser: ArgumentParser, args: Namespace): + """Run the check command.""" + validate_check_args(parser, args) -def run_check(spec_fn, o, s): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) + errors = check(spec, args.yaml) - errors = check(spec, spec_fn) - if s == "igvf": + if args.skip == "igvf": errors = filter_errors(errors, "igvf") - elif s == "igvf_onlist_skip": + elif args.skip == "igvf_onlist_skip": errors = filter_errors(errors, "igvf_onlist_skip") if errors: - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: for idx, e in enumerate(errors, 1): print(format_error(e, idx), file=f) else: diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py index ccffa7c..11a4d1a 100644 --- a/seqspec/seqspec_convert.py +++ b/seqspec/seqspec_convert.py @@ -1,16 +1,22 @@ -from seqspec.utils import load_genbank +"""Convert module for seqspec CLI. + +This module provides functionality to convert between different formats (seqspec, genbank, token). 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace import json -from seqspec.Region import Region -from seqspec.Assay import Assay -from seqspec.utils import load_spec import numpy as np from typing import Dict, List, Tuple -from os import path -from pathlib import Path +import os +from seqspec.utils import load_genbank, load_spec +from seqspec.Region import Region +from seqspec.Assay import Assay -schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") +# Load schema and constants +schema_fn = os.path.join(os.path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as f: schema = json.load(f) REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"] @@ -18,51 +24,96 @@ SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"] -def setup_convert_args(parser): +def setup_convert_args(parser) -> ArgumentParser: + """Create and configure the convert command subparser.""" subparser = parser.add_parser( "convert", - description="get genbank about seqspec file", - help="get genbank about seqspec file", + description=""" +Convert between different formats (seqspec, genbank, token). + +Examples: +seqspec convert -ifmt seqspec -ofmt token spec.yaml -o output_dir # Convert seqspec to token format +seqspec convert -ifmt genbank -ofmt seqspec input.gb -o spec.yaml # Convert genbank to seqspec +--- +""", + help="Convert between different formats", + formatter_class=RawTextHelpFormatter, ) choices = ["genbank", "seqspec", "token"] subparser.add_argument( - "-ifmt", help="Input format", type=str, default="seqspec", choices=choices + "-ifmt", + "--input-format", + help="Input format", + type=str, + default="seqspec", + choices=choices, ) subparser.add_argument( - "-ofmt", help="Output format", type=str, default="token", choices=choices + "-ofmt", + "--output-format", + help="Output format", + type=str, + default="token", + choices=choices, ) - subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file or directory", + type=Path, default=None, required=False, ) subparser.add_argument( - "input_file", metavar="IN", help="Path to input file", type=str + "input_file", + metavar="IN", + help="Path to input file", + type=Path, ) return subparser -def validate_convert_args(parser, args): - # if everything is valid the run_convert - fn = args.input_file - ifmt = args.ifmt - ofmt = args.ofmt - o = args.o +def validate_convert_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the convert command arguments.""" + if not Path(args.input_file).exists(): + parser.error(f"Input file does not exist: {args.input_file}") + + if args.output and Path(args.output).exists(): + if args.output_format == "token": + if not Path(args.output).is_dir(): + parser.error( + f"Output path exists but is not a directory: {args.output}" + ) + else: + if not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_convert(parser: ArgumentParser, args: Namespace) -> None: + """Run the convert command.""" + validate_convert_args(parser, args) + + CONVERT = { + ("genbank", "seqspec"): gb_to_seqspec, + ("seqspec", "token"): seqspec_to_token, + } - cnv = run_convert(fn, ifmt, ofmt, o) - print(cnv) + file = load_input_file(args.input_file, args.input_format) + result = CONVERT[(args.input_format, args.output_format)](file) - # if o: - # spec.to_YAML(o) - # else: - # 
print(json.dumps(spec, sort_keys=False, indent=4)) + if args.output: + if args.output_format == "token": + save_tokenized_spec(*result, str(args.output)) # noqa + else: + # Handle other output formats here + pass + else: + return result -def load_input_file(fn, ifmt): +def load_input_file(fn: Path, ifmt: str): + """Load input file based on format.""" LOAD = { "genbank": load_genbank, "seqspec": load_spec, @@ -123,20 +174,6 @@ def save_tokenized_spec( f.write(f"{feature}\n") -def run_convert(fn, ifmt, ofmt, o): - CONVERT = { - ("genbank", "seqspec"): gb_to_seqspec, - ("seqspec", "token"): seqspec_to_token, - } - file = load_input_file(fn, ifmt) - c = CONVERT[(ifmt, ofmt)](file) - if o: - save_tokenized_spec(*c, o) - else: - return c - return - - def seqspec_to_token(spec): # for each modalitiy, make a dictionary of regions specs_regions = {} @@ -150,7 +187,7 @@ def seqspec_to_token(spec): def tokenize_specs( - specs_regions: Dict[str, Dict[str, List[Dict]]] + specs_regions: Dict[str, Dict[str, List[Dict]]], ) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]: """ Convert specs into a single matrix where each row represents a complete region specification @@ -167,14 +204,6 @@ def tokenize_specs( n_region_type_features = len(REGION_TYPES) n_sequence_type_features = len(SEQUENCE_TYPES) - # Total features = one-hot encodings + numerical features - total_features = ( - n_modality_features # modality one-hot - + n_region_type_features # region_type one-hot - + n_sequence_type_features # sequence_type one-hot - + 2 # min_len, max_len - ) - # Total features = one-hot encodings + numerical features + position total_features = ( n_modality_features # modality one-hot @@ -184,6 +213,8 @@ def tokenize_specs( + 1 # position in region list (1-based) ) + # features = get_feature_names() + rows = [] # Will hold our feature vectors row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples @@ -192,7 +223,7 @@ def tokenize_specs( # Enumerate regions to get position (1-based) for position, region in enumerate(regions, start=1): # Create feature vector for this region - feature_vector = np.zeros(total_features) + feature_vector = np.zeros(total_features).astype(int) current_idx = 0 # Add modality one-hot diff --git a/seqspec/seqspec_diff.py b/seqspec/seqspec_diff.py index c4e6a57..b4ba3a8 100644 --- a/seqspec/seqspec_diff.py +++ b/seqspec/seqspec_diff.py @@ -1,45 +1,156 @@ -from .Region import Region -from seqspec.Assay import Assay +"""Diff module for seqspec. + +This module provides functionality to compare two seqspec files and identify differences. +""" + +from pathlib import Path +from argparse import ArgumentParser, Namespace +from typing import List + from seqspec.utils import load_spec +from seqspec.Assay import Assay +from seqspec.Region import Region -def setup_diff_args(parser): - parser_diff = parser.add_parser( +def setup_diff_args(parser) -> ArgumentParser: + """Create and configure the diff command subparser.""" + subparser = parser.add_parser( "diff", - description="diff two seqspecs", - help="diff two seqspecs", + description=""" +Compare two seqspec files and identify differences. 
+ +Examples: +seqspec diff spec1.yaml spec2.yaml # Compare two specs and print differences +seqspec diff spec1.yaml spec2.yaml -o diff.txt # Compare specs and save differences to file +--- +""", + help="Compare two seqspec files and identify differences", + ) + subparser.add_argument( + "yamlA", help="First sequencing specification yaml file", type=str ) - parser_diff.add_argument("yamlA", help="Sequencing specification yaml file A") - parser_diff.add_argument("yamlB", help="Sequencing specification yaml file B") - parser_diff.add_argument( + subparser.add_argument( + "yamlB", help="Second sequencing specification yaml file", type=str + ) + subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) - return parser_diff + return subparser + + +def validate_diff_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the diff command arguments.""" + if not Path(args.yamlA).exists(): + parser.error(f"Input file A does not exist: {args.yamlA}") + if not Path(args.yamlB).exists(): + parser.error(f"Input file B does not exist: {args.yamlB}") + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_diff(parser: ArgumentParser, args: Namespace) -> None: + """Run the diff command.""" + validate_diff_args(parser, args) + + spec_a = load_spec(args.yamlA) + spec_b = load_spec(args.yamlB) + + differences = compare_specs(spec_a, spec_b) + + if args.output: + args.output.write_text(differences) + else: + print(differences) + + +def compare_specs(spec_a: Assay, spec_b: Assay) -> str: + """Compare two specs and return a string describing their differences.""" + differences = [] + + # Compare modalities + modalities_a = set(spec_a.modalities) + modalities_b = set(spec_b.modalities) + + if modalities_a != modalities_b: + differences.append("Modalities differ:") + differences.append(f" Spec A: {', '.join(sorted(modalities_a))}") + differences.append(f" Spec B: {', '.join(sorted(modalities_b))}") + + # Compare common modalities + common_modalities = modalities_a.intersection(modalities_b) + for modality in common_modalities: + regions_a = spec_a.get_libspec(modality).get_leaves() + regions_b = spec_b.get_libspec(modality).get_leaves() + + region_diffs = compare_regions(regions_a, regions_b) + if region_diffs: + differences.append(f"\nModality '{modality}' differences:") + differences.extend(region_diffs) + + return "\n".join(differences) if differences else "No differences found" + + +def compare_regions(regions_a: List[Region], regions_b: List[Region]) -> List[str]: + """Compare two lists of regions and return a list of differences.""" + differences = [] + + # Create lookup dictionaries + regions_a_dict = {r.region_id: r for r in regions_a} + regions_b_dict = {r.region_id: r for r in regions_b} + + # Find regions unique to each spec + unique_to_a = set(regions_a_dict.keys()) - set(regions_b_dict.keys()) + unique_to_b = set(regions_b_dict.keys()) - set(regions_a_dict.keys()) + + if unique_to_a: + differences.append(" Regions only in spec A:") + for region_id in sorted(unique_to_a): + differences.append(f" - {region_id}") + + if unique_to_b: + differences.append(" Regions only in spec B:") + for region_id in sorted(unique_to_b): + differences.append(f" - {region_id}") + # Compare common regions + common_regions = set(regions_a_dict.keys()).intersection(set(regions_b_dict.keys())) + for region_id in 
sorted(common_regions): + region_a = regions_a_dict[region_id] + region_b = regions_b_dict[region_id] -def validate_diff_args(parser, args): - # if everything is valid the run_diff - A_fn = args.yamlA - B_fn = args.yamlB - # o = args.o - A = load_spec(A_fn) - B = load_spec(B_fn) + region_diffs = diff_regions(region_a, region_b) + if region_diffs: + differences.append(f" Region '{region_id}' differences:") + differences.extend(f" - {diff}" for diff in region_diffs) - # load in two specs - run_diff(A, B) + return differences -def run_diff(A: Assay, B: Assay): - # What does it mean to diff two assays? - # Only compare on modalities? - # itx: pull out regions that have the same name? - # itx: - pass +def diff_regions(region_a: Region, region_b: Region) -> List[str]: + """Compare two regions and return a list of differences.""" + differences = [] + # Compare basic properties + if region_a.region_type != region_b.region_type: + differences.append( + f"region_type: {region_a.region_type} != {region_b.region_type}" + ) + if region_a.name != region_b.name: + differences.append(f"name: {region_a.name} != {region_b.name}") + if region_a.sequence_type != region_b.sequence_type: + differences.append( + f"sequence_type: {region_a.sequence_type} != {region_b.sequence_type}" + ) + if region_a.sequence != region_b.sequence: + differences.append(f"sequence: {region_a.sequence} != {region_b.sequence}") + if region_a.min_len != region_b.min_len: + differences.append(f"min_len: {region_a.min_len} != {region_b.min_len}") + if region_a.max_len != region_b.max_len: + differences.append(f"max_len: {region_a.max_len} != {region_b.max_len}") -def diff_regions(R1: Region, R2: Region): - pass + return differences diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 31be20d..48563e6 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -1,16 +1,22 @@ +"""File module for seqspec. + +This module provides functionality to list and format files present in seqspec files. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, List, Optional +from collections import defaultdict +import json + from seqspec.utils import load_spec from seqspec.Assay import Assay -from collections import defaultdict from seqspec.File import File -from typing import Dict, List, Optional -from argparse import RawTextHelpFormatter -import json from seqspec import seqspec_find -import os -import argparse -def setup_file_args(parser): +def setup_file_args(parser) -> ArgumentParser: + """Create and configure the file command subparser.""" subparser = parser.add_parser( "file", description=""" @@ -29,36 +35,37 @@ def setup_file_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "-i", + "--ids", metavar="IDs", - help=("Ids to list"), + help="Ids to list", type=str, default=None, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) - # the object we are using to index - # choices = ["read", "region", "file", "onlist", "region-type"] choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -66,6 +73,7 @@ def setup_file_args(parser): choices = ["paired", "interleaved", "index", "list", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", help=f"Format, [{', '.join(choices)}], default: paired", type=str, @@ -84,6 +92,7 @@ def setup_file_args(parser): ] subparser.add_argument( "-k", + "--key", metavar="KEY", help=f"Key, [{', '.join(choices)}], default: file_id", type=str, @@ -94,7 +103,7 @@ def setup_file_args(parser): # option to get the full path of the file subparser.add_argument( "--fullpath", - help=argparse.SUPPRESS, + help="Use full path for local files", action="store_true", default=False, ) @@ -102,59 +111,82 @@ def setup_file_args(parser): return subparser -def validate_file_args(parser, args): - spec_fn = os.path.abspath(args.yaml) - o = args.o - m = args.m # modality - idtype = args.s # selector - fmt = args.f # format - ids = args.i # ids - k = args.k # key - fp = args.fullpath - - if (k == "filesize" or k == "filetype" or k == "urltype" or k == "md5") and ( - fmt == "paired" or fmt == "interleaved" or fmt == "index" - ): - parser.error(f"-f {fmt} valid only with -k file_id, filename, url") - - return run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=fp) - - -def run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=False): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - files = file(spec, m, ids, idtype, fmt, k, spec_fn, fp) +def validate_file_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the file command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + 
parser.error(f"Output path exists but is not a file: {args.output}") + + if args.key in ["filesize", "filetype", "urltype", "md5"] and args.format in [ + "paired", + "interleaved", + "index", + ]: + parser.error( + f"Format '{args.format}' valid only with key 'file_id', 'filename', or 'url'" + ) + + +def run_file(parser: ArgumentParser, args: Namespace) -> None: + """Run the file command.""" + validate_file_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + files = list_files( + spec, + args.modality, + ids, + args.selector, + args.format, + args.key, + args.yaml, + args.fullpath, + ) if files: - if o: - with open(o, "w") as f: - print(files, file=f) + if args.output: + args.output.write_text(str(files)) else: print(files) - return -def file( +def list_files( spec: Assay, modality: str, ids: List[str], idtype: str, fmt: str, - k: Optional[str], - spec_fn: str, + k: str, + spec_fn: Path, fp: bool = False, -): +) -> str: + """List files based on the given parameters. + + Args: + spec: The seqspec specification. + modality: The modality to list files for. + ids: List of IDs to filter by. + idtype: Type of ID to filter by (read, region, file, region-type). + fmt: Output format (paired, interleaved, index, list, json). + k: Key to use for output (file_id, filename, etc.). + spec_fn: Path to the spec file. + fp: Whether to use full paths for local files. + + Returns: + Formatted string containing the file information. + """ # NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST # NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read. LIST_FILES = { "read": list_read_files, "region": list_region_files, - "file": list_files, - # "onlist": list_onlist_files, + "file": list_all_files, } + LIST_FILES_BY_ID = { "read": list_files_by_read_id, "file": list_files_by_file_id, @@ -162,13 +194,6 @@ def file( "region-type": list_files_by_region_type, } - if len(ids) == 0: - # list all the files - files = LIST_FILES[idtype](spec, modality) - else: - # list only the id files - files = LIST_FILES_BY_ID[idtype](spec, modality, ids) - FORMAT = { "list": format_list_files_metadata, "paired": format_list_files, @@ -177,19 +202,30 @@ def file( "json": format_json_files, } - x = FORMAT[fmt](files, fmt, k, spec_fn, fp) - return x + # Get files based on whether we're filtering by IDs + if not ids: + # list all files + files = LIST_FILES[idtype](spec, modality) + else: + # list files by id + files = LIST_FILES_BY_ID[idtype](spec, modality, ids) + # Format the output + return FORMAT[fmt](files, fmt, k, spec_fn, fp) -def list_read_files(spec, modality): + +def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List files for all reads in a modality.""" files = defaultdict(list) reads = spec.get_seqspec(modality) for rd in reads: - files[rd.read_id] = rd.files + if rd.files: + files[rd.read_id] = rd.files return files -def list_files(spec, modality): +def list_all_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List all files in a modality.""" files_rd = list_read_files(spec, modality) files_rgn = list_region_files(spec, modality) return {**files_rd, **files_rgn} @@ -210,131 +246,155 @@ def list_region_files(spec, modality): def format_list_files_metadata( - files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False -): - x = "" + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format file 
metadata as a tab-separated list.""" + x = [] if k == "all": for items in zip(*files.values()): for key, item in zip(files.keys(), items): - x += f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}\n" - x = x[:-1] - + x.append( + f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}" + ) else: for items in zip(*files.values()): for key, item in zip(files.keys(), items): attr = str(getattr(item, k)) id = item.file_id - x += f"{key}\t{id}\t{attr}\n" - x = x[:-1] - - return x + x.append(f"{key}\t{id}\t{attr}") + return "\n".join(x) -def format_json_files(files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False): +def format_json_files( + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as JSON.""" x = [] for items in zip(*files.values()): if k == "all": for key, item in zip(files.keys(), items): d = item.to_dict() if item.urltype == "local" and fp: - d["url"] = os.path.join(os.path.dirname(spec_fn), d["url"]) + d["url"] = str(spec_fn.parent / d["url"]) x.append(d) else: for key, item in zip(files.keys(), items): attr = getattr(item, k) if k == "url" and item.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) + attr = str(spec_fn.parent / attr) x.append({"file_id": item.file_id, k: attr}) return json.dumps(x, indent=4) -def format_list_files(files: Dict[str, List[File]], fmt, k=None, spec_fn="", fp=False): - x = "" +def format_list_files( + files: Dict[str, List[File]], + fmt: str, + k: Optional[str] = None, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as a list based on the format type.""" + x = [] + if fmt == "paired": - x = "" for items in zip(*files.values()): - t = "" + t = [] for i in items: if k: attr = str(getattr(i, k)) if k == "url" and i.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) - t += f"{attr}\t" + attr = str(spec_fn.parent / attr) + t.append(attr) else: - t += f"{i.filename}\t" - x += f"{t[:-1]}\n" - x = x[:-1] + t.append(i.filename) + x.append("\t".join(t)) - elif fmt == "interleaved" or fmt == "list": + elif fmt in ["interleaved", "list"]: for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "\n" - x = x[:-1] + id = str(spec_fn.parent / id) + x.append(id) + elif fmt == "index": + t = [] for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "," - x = x[:-1] + id = str(spec_fn.parent / id) + t.append(id) + x.append(",".join(t)) - return x + return "\n".join(x) -def list_files_by_read_id(spec, modality, read_ids): +def list_files_by_read_id( + spec: Assay, modality: str, read_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific read IDs.""" seqspec = spec.get_seqspec(modality) files = defaultdict(list) ids = set(read_ids) # TODO return the files in the order of the ids given in the input # NOTE ORDERING HERE IS IMPORANT SEE GET_INDEX_BY_FILES FUNCTION for read in seqspec: - if read.read_id in ids: - for file in read.files: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.read_id in ids and 
read.files: + files[read.read_id].extend(read.files) return files -def list_files_by_file_id(spec, modality, file_ids): +def list_files_by_file_id( + spec: Assay, modality: str, file_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific file IDs.""" seqspec = spec.get_seqspec(modality) ids = set(file_ids) files = defaultdict(list) # TODO: NOTE ORDERING HERE IS IMPORTANT SEE RUN_LIST_FILES FUNCTION for read in seqspec: - for file in read.files: - if file.filename in ids: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.files: + for file in read.files: + if file.filename in ids: + files[read.read_id].append(file) return files -def list_files_by_region_id(spec, modality, file_ids): +def list_files_by_region_id( + spec: Assay, modality: str, region_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific region IDs.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_ids) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): if region_id in ids: - new_files[region_id] += files + new_files[region_id].extend(region_files) return new_files -def list_files_by_region_type(spec, modality, file_ids): +def list_files_by_region_type( + spec: Assay, modality: str, region_types: List[str] +) -> Dict[str, List[File]]: + """List files for specific region types.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_types) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): r = seqspec_find.find_by_region_id(spec, modality, region_id)[0] - rt = r.region_type - if rt in ids: - new_files[region_id] += files + if r.region_type in ids: + new_files[region_id].extend(region_files) return new_files diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py index f5aec05..c1826ce 100644 --- a/seqspec/seqspec_find.py +++ b/seqspec/seqspec_find.py @@ -1,13 +1,24 @@ +"""Find module for seqspec CLI. + +This module provides functionality to search for objects within seqspec files. 
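+
+Example usage (illustrative; the modality and region type here are hypothetical):
+
+    seqspec find -m rna -s region-type -i barcode spec.yaml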
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +import yaml +from typing import List + from seqspec.utils import load_spec from seqspec.Assay import Assay -import yaml -import argparse -import warnings -from argparse import RawTextHelpFormatter -from seqspec.seqspec_file import list_files +from seqspec.Read import Read +from seqspec.Region import Region +from seqspec.File import File +from seqspec.seqspec_file import list_all_files -def setup_find_args(parser): +def setup_find_args(parser) -> ArgumentParser: + """Create and configure the find command subparser.""" subparser = parser.add_parser( "find", description=""" @@ -24,45 +35,48 @@ def setup_find_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) # depracate - subparser.add_argument("--rtype", help=argparse.SUPPRESS, action="store_true") + subparser.add_argument("--rtype", help=SUPPRESS, action="store_true") choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", - metavar="Selector", - help=(f"Selector, [{','.join(choices)}] (default: region)"), + "--selector", + metavar="SELECTOR", + help=f"Selector, [{','.join(choices)}] (default: region)", type=str, default="region", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) # depracate -r subparser_required.add_argument( "-r", metavar="REGION", - help=argparse.SUPPRESS, + help=SUPPRESS, type=str, default=None, ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, @@ -71,8 +85,14 @@ def setup_find_args(parser): return subparser -def validate_find_args(parser, args): - # IDs +def validate_find_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the find command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. 
" @@ -80,45 +100,47 @@ def validate_find_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - o = args.o - idtype = args.s # selector - ids = args.i + if not args.id: + args.id = args.r - # run function - return run_find(fn, m, ids, idtype, o) +def run_find(parser: ArgumentParser, args: Namespace) -> None: + """Run the find command.""" + validate_find_args(parser, args) -def run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) found = [] - if idtype == "region-type": - found = find_by_region_type(spec, modality, id) - elif idtype == "region": - found = find_by_region_id(spec, modality, id) - elif idtype == "read": - found = find_by_read_id(spec, modality, id) - elif idtype == "file": - found = find_by_file_id(spec, modality, id) + + if args.selector == "region-type": + found = find_by_region_type(spec, args.modality, args.id) + elif args.selector == "region": + found = find_by_region_id(spec, args.modality, args.id) + elif args.selector == "read": + found = find_by_read_id(spec, args.modality, args.id) + elif args.selector == "file": + found = find_by_file_id(spec, args.modality, args.id) else: - raise ValueError(f"Unknown idtype: {idtype}") + raise ValueError(f"Unknown selector: {args.selector}") # post processing - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: yaml.dump(found, f, sort_keys=False) else: print(yaml.dump(found, sort_keys=False)) - return +def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: + """Find reads by their ID. -# TODO implement -def find_by_read_id(spec: Assay, modality: str, id: str): + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The read ID to search for. + + Returns: + A list of Read objects matching the ID. + """ rds = [] reads = spec.get_seqspec(modality) for r in reads: @@ -127,10 +149,19 @@ def find_by_read_id(spec: Assay, modality: str, id: str): return rds -# TODO implement -def find_by_file_id(spec: Assay, modality: str, id: str): +def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: + """Find files by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The file ID to search for. + + Returns: + A list of File objects matching the ID. + """ files = [] - lf = list_files(spec, modality) + lf = list_all_files(spec, modality) for k, v in lf.items(): for f in v: if f.file_id == id: @@ -138,13 +169,33 @@ def find_by_file_id(spec: Assay, modality: str, id: str): return files -def find_by_region_id(spec: Assay, modality: str, id: str): +def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: + """Find regions by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region ID to search for. + + Returns: + A list of Region objects matching the ID. + """ m = spec.get_libspec(modality) regions = m.get_region_by_id(id) return regions -def find_by_region_type(spec: Assay, modality: str, id: str): +def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: + """Find regions by their type. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region type to search for. + + Returns: + A list of Region objects matching the type. 
+ """ m = spec.get_libspec(modality) regions = m.get_region_by_region_type(id) return regions diff --git a/seqspec/seqspec_format.py b/seqspec/seqspec_format.py index 87eccad..b6d209f 100644 --- a/seqspec/seqspec_format.py +++ b/seqspec/seqspec_format.py @@ -1,8 +1,25 @@ +"""Format module for seqspec CLI. + +This module provides functionality to automatically format and fill in missing fields +in a seqspec specification file. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay -def setup_format_args(parser): +def setup_format_args(parser) -> ArgumentParser: + """Create and configure the format command subparser. + + Args: + parser: The main argument parser to add the format subparser to. + + Returns: + The configured format subparser. + """ subparser = parser.add_parser( "format", description=""" @@ -16,34 +33,58 @@ def setup_format_args(parser): help="Autoformat seqspec file", formatter_class=RawTextHelpFormatter, ) - # subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) return subparser -def validate_format_args(parser, args): - fn = args.yaml - o = args.o +def validate_format_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the format command arguments. + + Args: + parser: The argument parser. + args: The parsed arguments. - run_format(spec_fn=fn, o=o) + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_format(spec_fn, o): - spec = load_spec(spec_fn) - format(spec) - if o: - spec.to_YAML(o) + +def run_format(parser: ArgumentParser, args: Namespace) -> None: + """Run the format command. + + Args: + parser: The argument parser. + args: The parsed arguments. + """ + validate_format_args(parser, args) + + spec = load_spec(args.yaml) + format_spec(spec) + + if args.output: + spec.to_YAML(args.output) else: print(spec.to_YAML()) -def format(spec): - return spec.update_spec() +def format_spec(spec: Assay) -> None: + """Format a seqspec specification by updating its fields. + + Args: + spec: The seqspec specification to format. + """ + spec.update_spec() diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index 4bbc657..c08576b 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -1,19 +1,28 @@ +"""Index module for seqspec CLI. + +This module provides functionality to identify the position of elements in a spec for use in downstream tools. 
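+
+Example usage (illustrative; read IDs are hypothetical):
+
+    seqspec index -m rna -t kb -s read -i rna_R1,rna_R2 spec.yaml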
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +from typing import List, Optional + from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import find_by_region_id -import warnings from seqspec.seqspec_file import list_files_by_file_id, list_read_files -from argparse import SUPPRESS, RawTextHelpFormatter -from seqspec.Region import complement_sequence -from seqspec.Region import RegionCoordinateDifference - from seqspec.Region import ( + complement_sequence, + RegionCoordinateDifference, project_regions_to_coordinates, itx_read, ) from seqspec.Read import ReadCoordinate +from seqspec.Assay import Assay -def setup_index_args(parser): +def setup_index_args(parser) -> ArgumentParser: + """Create and configure the index command subparser.""" subparser = parser.add_parser( "index", description=""" @@ -30,12 +39,13 @@ def setup_index_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) @@ -50,6 +60,7 @@ def setup_index_args(parser): choices = [ "chromap", "kb", + "kb-single", "relative", "seqkit", "simpleaf", @@ -60,8 +71,9 @@ def setup_index_args(parser): ] subparser.add_argument( "-t", + "--tool", metavar="TOOL", - help=(f"Tool, [{', '.join(choices)}] (default: tab)"), + help=f"Tool, [{', '.join(choices)}] (default: tab)", default="tab", type=str, choices=choices, @@ -70,8 +82,9 @@ def setup_index_args(parser): choices = ["read", "region", "file"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -90,16 +103,17 @@ def setup_index_args(parser): subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser_required.add_argument( "-i", + "--ids", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, required=False, @@ -114,7 +128,8 @@ def setup_index_args(parser): return subparser -def validate_index_args(parser, args): +def validate_index_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the index command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -122,8 +137,8 @@ def validate_index_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r + if not args.ids: + args.ids = args.r if args.region: warnings.warn( "The '--region' argument is deprecated and will be removed in a future version. 
" @@ -131,61 +146,67 @@ def validate_index_args(parser, args): DeprecationWarning, ) - fn = args.yaml - m = args.m - ids = args.i # this can be a list of ids (reads, regions, or files) - t = args.t - o = args.o - subregion_type = args.subregion_type - rev = args.rev - idtype = args.s + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - if ids is None and (idtype == "read" or idtype == "region"): + if args.ids is None and (args.selector == "read" or args.selector == "region"): parser.error("Must specify ids with -i for -s read or -s region") - return run_index(fn, m, ids, idtype, t, rev, subregion_type, o=o) - - -def run_index( - spec_fn, - modality, - ids, - idtype, - fmt, - rev, - subregion_type, - o, -): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - x = index(spec, modality, ids, idtype, fmt, rev, subregion_type) +def run_index(parser: ArgumentParser, args: Namespace) -> None: + """Run the index command.""" + validate_index_args(parser, args) - # post processing - if o: - with open(o, "w") as f: - print(x, file=f) - else: - print(x) + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] - return + result = index( + spec, + args.modality, + ids, + args.selector, + args.tool, + args.rev, + args.subregion_type, + ) + + if args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) def index( - spec, - modality, - ids, - idtype, - fmt, - rev=False, - subregion_type=None, -): + spec: Assay, + modality: str, + ids: List[str], + idtype: str, + fmt: str, + rev: bool = False, + subregion_type: Optional[str] = None, +) -> str: + """Get index information from the spec. + + Args: + spec: The seqspec specification. + modality: The modality to index. + ids: List of IDs to index. + idtype: Type of ID (read, region, file). + fmt: Output format. + rev: Whether to return 3'->5' region order. + subregion_type: Optional subregion type. + + Returns: + Formatted index information. 
+ """ FORMAT = { "chromap": format_chromap, "kb": format_kallisto_bus, + "kb-single": format_kallisto_bus_force_single, "relative": format_relative, "seqkit": format_seqkit_subseq, "simpleaf": format_simpleaf, @@ -205,7 +226,7 @@ def index( "read": get_index_by_read_ids, } - if len(ids) == 0: + if not ids: indices = GET_INDICES[idtype](spec, modality) else: indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids) @@ -311,6 +332,43 @@ def format_kallisto_bus(indices, subregion_type=None): return x +def format_kallisto_bus_force_single(indices, subregion_type=None): + bcs = [] + umi = [] + feature = [] + longest_feature = None + max_length = 0 + + for idx, region in enumerate(indices): + rg_strand = region.pop("strand") # noqa + for rgn, cuts in region.items(): + for cut in cuts: + if cut.region_type.upper() == "BARCODE": + bcs.append(f"{idx},{cut.start},{cut.stop}") + elif cut.region_type.upper() == "UMI": + umi.append(f"{idx},{cut.start},{cut.stop}") + elif ( + cut.region_type.upper() == "CDNA" + or cut.region_type.upper() == "GDNA" + or cut.region_type.upper() == "PROTEIN" + or cut.region_type.upper() == "TAG" + ): + length = cut.stop - cut.start + if length > max_length: + max_length = length + longest_feature = f"{idx},{cut.start},{cut.stop}" + + if len(umi) == 0: + umi.append("-1,-1,-1") + if len(bcs) == 0: + bcs.append("-1,-1,-1") + if longest_feature: + feature.append(longest_feature) + + x = ",".join(bcs) + ":" + ",".join(umi) + ":" + ",".join(feature) + return x + + # this one should only return one string # TODO: return to this def format_seqkit_subseq(indices, subregion_type=None): diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index c01a963..b9573f7 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -4,10 +4,12 @@ from seqspec.Region import Region from seqspec.Read import Read from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter +from argparse import RawTextHelpFormatter, ArgumentParser, Namespace +from pathlib import Path -def setup_info_args(parser): +def setup_info_args(parser) -> ArgumentParser: + """Create and configure the info command subparser.""" subparser = parser.add_parser( "info", description=""" @@ -24,12 +26,13 @@ def setup_info_args(parser): formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) choices = ["modalities", "meta", "sequence_spec", "library_spec"] subparser.add_argument( "-k", + "--key", metavar="KEY", - help=(f"Object to display, [{', '.join(choices)}] (default: meta)"), + help=f"Object to display, [{', '.join(choices)}] (default: meta)", type=str, default="meta", required=False, @@ -37,8 +40,9 @@ def setup_info_args(parser): choices = ["tab", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"The output format, [{', '.join(choices)}] (default: tab)"), + help=f"The output format, [{', '.join(choices)}] (default: tab)", type=str, default="tab", required=False, @@ -46,26 +50,30 @@ def setup_info_args(parser): ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, required=False, ) return subparser -def validate_info_args(parser, args): - spec_fn = args.yaml - o = args.o - k = args.k - fmt = args.f - return run_info(spec_fn, fmt, k, o) +def validate_info_args(parser: ArgumentParser, args: Namespace) -> None: + 
"""Validate the info command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_info(spec_fn, f, k=None, o=None): - # return json of the Assay object - spec = load_spec(spec_fn) +def run_info(parser: ArgumentParser, args: Namespace) -> None: + """Run the info command.""" + validate_info_args(parser, args) + + spec = load_spec(args.yaml) CMD = { "modalities": seqspec_info_modalities, "meta": seqspec_info, @@ -73,23 +81,24 @@ def run_info(spec_fn, f, k=None, o=None): "library_spec": seqspec_info_library_spec, } s = "" - if k: - s = CMD[k](spec, f) + if args.key: + s = CMD[args.key](spec, args.format) - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: json.dump(s, f, sort_keys=False, indent=4) else: print(s) - return -def seqspec_info(spec, fmt): +def seqspec_info(spec: Assay, fmt: str) -> str: + """Get meta information about the spec.""" s = format_info(spec, fmt) return s -def seqspec_info_library_spec(spec, fmt): +def seqspec_info_library_spec(spec: Assay, fmt: str) -> str: + """Get library specification information.""" modalities = spec.list_modalities() s = "" for m in modalities: @@ -98,17 +107,20 @@ def seqspec_info_library_spec(spec, fmt): return s -def seqspec_info_sequence_spec(spec: Assay, fmt): +def seqspec_info_sequence_spec(spec: Assay, fmt: str) -> str: + """Get sequence specification information.""" reads = format_sequence_spec(spec.sequence_spec, fmt) return reads -def seqspec_info_modalities(spec, fmt): +def seqspec_info_modalities(spec: Assay, fmt: str) -> str: + """Get list of modalities.""" modalities = format_modalities(spec.list_modalities(), fmt) return modalities -def format_info(spec: Assay, fmt="tab"): +def format_info(spec: Assay, fmt: str = "tab") -> str: + """Format meta information.""" sd = spec.to_dict() del sd["library_spec"] del sd["sequence_spec"] @@ -120,11 +132,11 @@ def format_info(spec: Assay, fmt="tab"): s = s[:-1] elif fmt == "json": s = json.dumps(sd, sort_keys=False, indent=4) - return s -def format_modalities(modalities: List[str], fmt="tab"): +def format_modalities(modalities: List[str], fmt: str = "tab") -> str: + """Format list of modalities.""" s = "" if fmt == "tab": s = "\t".join(modalities) @@ -133,19 +145,24 @@ def format_modalities(modalities: List[str], fmt="tab"): return s -def format_sequence_spec(sequence_spec: List[Read], fmt="tab"): +def format_sequence_spec(sequence_spec: List[Read], fmt: str = "tab") -> str: + """Format sequence specification.""" s = "" if fmt == "tab": # format the output as a table for r in sequence_spec: - s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{','.join([i.file_id for i in r.files])}\n" + files = ",".join([i.file_id for i in r.files]) if r.files else "" + s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}\n" s = s[:-1] elif fmt == "json": s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4) return s -def format_library_spec(modality: str, library_spec: List[Region], fmt="tab"): +def format_library_spec( + modality: str, library_spec: List[Region], fmt: str = "tab" +) -> str: + """Format library specification.""" s = "" if fmt == "tab": for r in library_spec: diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py 
index 066c5b6..361324b 100644 --- a/seqspec/seqspec_init.py +++ b/seqspec/seqspec_init.py @@ -1,18 +1,25 @@ +"""Init module for seqspec CLI. + +This module provides functionality to generate new seqspec files from a newick tree format. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List + +import newick from seqspec.Assay import Assay from seqspec.Region import Region from seqspec.File import File from seqspec.Read import Read -from typing import List -import newick -from argparse import RawTextHelpFormatter # example +# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" # seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" " ((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" +# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" -# seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" "((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" -# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -def setup_init_args(parser): +def setup_init_args(parser) -> ArgumentParser: + """Create and configure the init command subparser.""" subparser = parser.add_parser( "init", description=""" @@ -27,85 +34,92 @@ def setup_init_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") subparser_required.add_argument( - "-n", metavar="NAME", type=str, help="assay name", required=True + "-n", "--name", metavar="NAME", type=str, help="Assay name", required=True ) - # -m "rna,atac" subparser_required.add_argument( "-m", + "--modalities", metavar="MODALITIES", type=str, - help="list of comma-separated modalities (e.g. rna,atac)", + help="List of comma-separated modalities (e.g. rna,atac)", required=True, ) # -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" subparser_required.add_argument( "-r", + "--reads", metavar="READS", type=str, - help="list of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", + help="List of modalities, reads, primer_ids, lengths, and strand (e.g. 
modality,fastq_name,primer_id,len,strand:...)", required=True, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "newick", - help=( - "tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)" - ), + help="Tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)", ) return subparser -def validate_init_args(parser, args): - name = args.n - modalities_str = args.m - newick_str = args.newick - o = args.o - reads_str = args.r +def validate_init_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the init command arguments.""" + if not args.newick: + parser.error("Newick tree must be provided") + + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - if newick_str is None: - parser.error("modality-FASTQs pairs must be provided") - return run_init(name, modalities_str, newick_str, reads_str, o) +def run_init(parser: ArgumentParser, args: Namespace) -> None: + """Run the init command.""" + validate_init_args(parser, args) + modalities = args.modalities.split(",") + reads = parse_reads_string(args.reads) + tree = newick.loads(args.newick) -def run_init(name: str, modalities_str, newick_str, reads_str, o=None): - modalities = modalities_str.split(",") - reads = parse_reads_string(reads_str) - tree = newick.loads(newick_str) if len(tree[0].descendants) != len(modalities): raise ValueError( "Number of modalities must match number of modality-FASTQs pairs" ) - reads = parse_reads_string(reads_str) - spec = init(name, modalities, tree[0].descendants, reads) + spec = init(args.name, modalities, tree[0].descendants, reads) - if o: - spec.to_YAML(o) + if args.output: + spec.to_YAML(args.output) else: print(spec.to_YAML()) - return +def init( + name: str, modalities: List[str], tree: List[newick.Node], reads: List[Read] +) -> Assay: + """Initialize a new seqspec specification. + + Args: + name: Name of the assay. + modalities: List of modalities. + tree: Newick tree nodes. + reads: List of read specifications. -def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): - # make read for each fastq - # make region for each modality - # add modality regions to assay - rgns = [] - for t in tree: - r = Region(region_id="", region_type="", name="", sequence_type="") - rgns.append(newick_to_region(t, r)) + Returns: + Initialized Assay object. + """ + regions = [] + for node in tree: + region = Region(region_id="", region_type="", name="", sequence_type="") + regions.append(newick_to_region(node, region)) - assay = Assay( + return Assay( assay_id="", name=name, doi="", @@ -118,59 +132,79 @@ def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): sequence_kit="", sequence_protocol="", sequence_spec=reads, - library_spec=rgns, + library_spec=regions, ) - return assay -def newick_to_region( - node, region=Region(region_id="", region_type="", name="", sequence_type="") -): +def newick_to_region(node: newick.Node, region: Region) -> Region: + """Convert a newick node to a Region object. + + Args: + node: Newick tree node. + region: Base region object to populate. + + Returns: + Populated Region object. 
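+
+    Example (illustrative): for the newick leaf "barcode:16", the returned
+    region has region_id and name "barcode" with min_len == max_len == 16.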
+ """ region.region_id = node.name region.name = node.name - if len(node.descendants) == 0: + if not node.descendants: region.min_len = int(node.length) region.max_len = int(node.length) return region + region.regions = [] - for n in node.descendants: + for descendant in node.descendants: region.regions.append( newick_to_region( - n, - Region(region_id=n.name, region_type="", name=n.name, sequence_type=""), + descendant, + Region( + region_id=descendant.name, + region_type="", + name=descendant.name, + sequence_type="", + ), ) ) return region -def parse_reads_string(input_string): +def parse_reads_string(input_string: str) -> List[Read]: + """Parse a string of read specifications into Read objects. + + Args: + input_string: String containing read specifications in format + "modality,read_id,primer_id,min_len,strand:..." + + Returns: + List of Read objects. + """ reads = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - modality, read_id, primer_id, min_len, strand = parts - - read = Read( - read_id=read_id, - name=read_id, - modality=modality, - primer_id=primer_id, - min_len=int(min_len), - max_len=int(min_len), - strand=strand, - files=[ - File( - file_id=read_id, - filename=read_id, - filetype="", - filesize=0, - url="", - urltype="", - md5="", - ) - ], + for obj in input_string.split(":"): + modality, read_id, primer_id, min_len, strand = obj.split(",") + + reads.append( + Read( + read_id=read_id, + name=read_id, + modality=modality, + primer_id=primer_id, + min_len=int(min_len), + max_len=int(min_len), + strand=strand, + files=[ + File( + file_id=read_id, + filename=read_id, + filetype="", + filesize=0, + url="", + urltype="", + md5="", + ) + ], + ) ) - reads.append(read) return reads diff --git a/seqspec/seqspec_methods.py b/seqspec/seqspec_methods.py index 2d19915..6f7a3b0 100644 --- a/seqspec/seqspec_methods.py +++ b/seqspec/seqspec_methods.py @@ -1,17 +1,27 @@ +"""Methods module for seqspec. + +This module provides functionality to convert seqspec files into methods sections. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec from seqspec.Assay import Assay from seqspec.Region import Region -from argparse import RawTextHelpFormatter +from seqspec.Read import Read, File -def setup_methods_args(parser): +def setup_methods_args(parser) -> ArgumentParser: + """Create and configure the methods command subparser.""" subparser = parser.add_parser( "methods", description=""" Convert seqspec file into methods section. 
Examples: -seqspec methods -m rna spec.yaml # Return methods section for rna modality +seqspec methods -m rna -o methods.txt spec.yaml # Save methods section to file +seqspec methods -m rna spec.yaml # Print methods section to stdout --- """, help="Convert seqspec file into methods section", @@ -19,46 +29,50 @@ def setup_methods_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_methods_args(parser, args): - # if everything is valid the run_methods - fn = args.yaml - o = args.o +def validate_methods_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the methods command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - m = args.m - return run_methods(fn, m, o) + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_methods(spec_fn, m, o): - spec = load_spec(spec_fn) - m = methods(spec, m) +def run_methods(parser: ArgumentParser, args: Namespace) -> None: + """Run the methods command.""" + validate_methods_args(parser, args) - if o: - with open(o, "w") as f: - print(m, file=f) + spec = load_spec(args.yaml) + methods_text = methods(spec, args.modality) + + if args.output: + args.output.write_text(methods_text) else: - print(m) + print(methods_text) -def methods(spec, modality): +def methods(spec: Assay, modality: str) -> str: + """Generate methods section for spec and modality.""" m = f"""Methods The {modality} portion of the {spec.name} assay was generated on {spec.date}. 
""" @@ -67,15 +81,16 @@ def methods(spec, modality): # TODO: manage sequence/library protocol/kit for cases where each modality has different protocols/kits -def format_library_spec(spec: Assay, m): - leaves = spec.get_libspec(m).get_leaves() +def format_library_spec(spec: Assay, modality: str) -> str: + """Format library specification for methods section.""" + leaves = spec.get_libspec(modality).get_leaves() lib_prot = None if isinstance(spec.library_protocol, str): lib_prot = spec.library_protocol elif isinstance(spec.library_protocol, list): for i in spec.library_protocol: - if i.modality == m: + if i.modality == modality: lib_prot = i.protocol_id lib_kit = None @@ -83,7 +98,7 @@ def format_library_spec(spec: Assay, m): lib_kit = spec.library_kit elif isinstance(spec.library_kit, list): for i in spec.library_kit: - if i.modality == m: + if i.modality == modality: lib_kit = i.kit_id seq_prot = None @@ -91,7 +106,7 @@ def format_library_spec(spec: Assay, m): seq_prot = spec.sequence_protocol elif isinstance(spec.sequence_protocol, list): for i in spec.sequence_protocol: - if i.modality == m: + if i.modality == modality: seq_prot = i.protocol_id seq_kit = None @@ -99,7 +114,7 @@ def format_library_spec(spec: Assay, m): seq_kit = spec.sequence_kit elif isinstance(spec.sequence_kit, list): for i in spec.sequence_kit: - if i.modality == m: + if i.modality == modality: seq_kit = i.kit_id s = f""" @@ -112,13 +127,14 @@ def format_library_spec(spec: Assay, m): \nSequence structure\n The library was sequenced on a {seq_prot} using the {seq_kit} sequencing kit. The library was sequenced using the following configuration:\n """ - reads = spec.get_seqspec(m) + reads = spec.get_seqspec(modality) for idx, r in enumerate(reads, 1): s += format_read(r, idx) return s -def format_region(region: Region, idx: int = 1): +def format_region(region: Region, idx: int = 1) -> str: + """Format region for methods section.""" s = f"{idx}. {region.name}: {region.min_len}-{region.max_len}bp {region.sequence_type} sequence ({region.sequence})" if region.onlist: s += f", onlist file: {region.onlist.filename}.\n" @@ -127,14 +143,15 @@ def format_region(region: Region, idx: int = 1): return s -def format_read(read, idx: int = 1): +def format_read(read: Read, idx: int = 1) -> str: + """Format read for methods section.""" s = f"- {read.name}: {read.max_len} cycles on the {'positive' if read.strand == 'pos' else 'negative'} strand using the {read.primer_id} primer. The following files contain the sequences in Read {idx}:\n" - for idx, f in enumerate(read.files, 1): - s += " " + format_read_file(f, idx) - s = s[:-1] + if read.files: + for idx, f in enumerate(read.files, 1): + s += " " + format_read_file(f, idx) return s -def format_read_file(file, idx: int = 1): - s = f"- File {idx}: {file.filename}\n" - return s +def format_read_file(file: File, idx: int = 1) -> str: + """Format read file for methods section.""" + return f"- File {idx}: {file.filename}\n" diff --git a/seqspec/seqspec_modify.py b/seqspec/seqspec_modify.py index 006cf00..78aa1ec 100644 --- a/seqspec/seqspec_modify.py +++ b/seqspec/seqspec_modify.py @@ -1,12 +1,20 @@ +"""Modify module for seqspec. + +This module provides functionality to modify attributes of various elements in seqspec files. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +from typing import List, Optional +import warnings + from seqspec.utils import load_spec from seqspec.File import File -from argparse import RawTextHelpFormatter, SUPPRESS -import warnings +from seqspec.Assay import Assay -# TODO fix modify to use the -s selector -def setup_modify_args(parser): - # given a spec, a region id and a list of key value property pairs, modify the spec +def setup_modify_args(parser) -> ArgumentParser: + """Create and configure the modify command subparser.""" subparser = parser.add_parser( "modify", description=""" @@ -22,34 +30,34 @@ def setup_modify_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) # Read properties subparser.add_argument( "--read-id", metavar="READID", - help=("New ID of read"), + help="New ID of read", type=str, default=None, ) subparser.add_argument( "--read-name", metavar="READNAME", - help=("New name of read"), + help="New name of read", type=str, default=None, ) subparser.add_argument( "--primer-id", metavar="PRIMERID", - help=("New ID of primer"), + help="New ID of primer", type=str, default=None, ) subparser.add_argument( "--strand", metavar="STRAND", - help=("New strand"), + help="New strand", type=str, default=None, ) @@ -59,45 +67,44 @@ def setup_modify_args(parser): subparser.add_argument( "--files", metavar="FILES", - help=("New files, (filename,filetype,filesize,url,urltype,md5:...)"), + help="New files, (filename,filetype,filesize,url,urltype,md5:...)", type=str, default=None, ) # Region properties - subparser.add_argument( "--region-id", metavar="REGIONID", - help=("New ID of region"), + help="New ID of region", type=str, default=None, ) subparser.add_argument( "--region-type", metavar="REGIONTYPE", - help=("New type of region"), + help="New type of region", type=str, default=None, ) subparser.add_argument( "--region-name", metavar="REGIONNAME", - help=("New name of region"), + help="New name of region", type=str, default=None, ) subparser.add_argument( "--sequence-type", metavar="SEQUENCETYPE", - help=("New type of sequence"), + help="New type of sequence", type=str, default=None, ) subparser.add_argument( "--sequence", metavar="SEQUENCE", - help=("New sequence"), + help="New sequence", type=str, default=None, ) @@ -106,25 +113,25 @@ def setup_modify_args(parser): subparser.add_argument( "--min-len", metavar="MINLEN", - help=("Min region length"), + help="Min region length", type=int, default=None, ) subparser.add_argument( "--max-len", metavar="MAXLEN", - help=("Max region length"), + help="Max region length", type=int, default=None, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, - required=False, ) subparser_required.add_argument( "-r", @@ -132,38 +139,38 @@ def setup_modify_args(parser): help=SUPPRESS, type=str, default=None, - required=False, ) subparser_required.add_argument( "-i", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, - required=False, ) choices = ["read", "region"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', 
'.join(choices)}] (default: read)", type=str, default="read", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality of the assay"), + help="Modality of the assay", type=str, - default=None, required=True, ) return subparser -def validate_modify_args(parser, args): +def validate_modify_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the modify command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -174,103 +181,99 @@ def validate_modify_args(parser, args): if not args.i: args.i = args.r - # if everything is valid the run_format - fn = args.yaml - o = args.o - modality = args.m - # target_r = args.r - idtype = args.s # selector - ids = args.i + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - # Read properties - read_id = args.read_id - read_name = args.read_name - primer_id = args.primer_id - strand = args.strand - files = args.files + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - # Region properties - region_id = args.region_id - region_type = args.region_type - region_name = args.region_name - sequence_type = args.sequence_type - sequence = args.sequence - # Read and Region properties - min_len = args.min_len - max_len = args.max_len +def run_modify(parser: ArgumentParser, args: Namespace) -> None: + """Run the modify command.""" + validate_modify_args(parser, args) - spec = load_spec(fn) + spec = load_spec(args.yaml) + # Read properties read_kwd = { - "read_id": read_id, - "read_name": read_name, - "primer_id": primer_id, - "min_len": min_len, - "max_len": max_len, - "strand": strand, - "files": files, + "read_id": args.read_id, + "read_name": args.read_name, + "primer_id": args.primer_id, + "min_len": args.min_len, + "max_len": args.max_len, + "strand": args.strand, + "files": args.files, } + # Region properties region_kwd = { - "region_id": region_id, - "region_type": region_type, - "name": region_name, - "sequence_type": sequence_type, - "sequence": sequence, - "min_len": min_len, - "max_len": max_len, + "region_id": args.region_id, + "region_type": args.region_type, + "name": args.region_name, + "sequence_type": args.sequence_type, + "sequence": args.sequence, + "min_len": args.min_len, + "max_len": args.max_len, } - if idtype == "region": - spec = run_modify_region(spec, modality, ids, **region_kwd) - elif idtype == "read": - spec = run_modify_read(spec, modality, ids, **read_kwd) - # update region in spec - # once the region is updated, update the spec + if args.selector == "region": + spec = run_modify_region(spec, args.modality, args.i, **region_kwd) + elif args.selector == "read": + spec = run_modify_read(spec, args.modality, args.i, **read_kwd) + + # Update spec spec.update_spec() - if o: - spec.to_YAML(o) + + if args.output: + args.output.write_text(spec.to_YAML()) else: print(spec.to_YAML()) def run_modify_read( - spec, - modality, - target_read, - read_id, - read_name, - primer_id, - min_len, - max_len, - strand, - files, -): + spec: Assay, + modality: str, + target_read: str, + read_id: Optional[str] = None, + read_name: Optional[str] = None, + primer_id: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, + strand: Optional[str] = None, + files: Optional[str] = None, +) -> Assay: + """Modify read properties in spec.""" reads = 
spec.get_seqspec(modality) if files: - files = parse_files_string(files) + files_list = parse_files_string(files) for r in reads: if r.read_id == target_read: r.update_read_by_id( - read_id, read_name, modality, primer_id, min_len, max_len, strand, files + read_id, + read_name, + modality, + primer_id, + min_len, + max_len, + strand, + files_list, ) - return spec def run_modify_region( - spec, - modality, - target_region, - region_id, - region_type, - name, - sequence_type, - sequence, - min_len, - max_len, -): + spec: Assay, + modality: str, + target_region: str, + region_id: Optional[str] = None, + region_type: Optional[str] = None, + name: Optional[str] = None, + sequence_type: Optional[str] = None, + sequence: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, +) -> Assay: + """Modify region properties in spec.""" spec.get_libspec(modality).update_region_by_id( target_region, region_id, @@ -281,27 +284,23 @@ def run_modify_region( min_len, max_len, ) - return spec -# filename,filetype,filesize,url,urltype,md5:... -def parse_files_string(input_string): +def parse_files_string(input_string: str) -> List[File]: + """Parse files string into list of File objects. # filename,filetype,filesize,url,urltype,md5:...""" files = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - filename, filetype, filesize, url, urltype, md5 = parts - - file = File( - file_id=filename, - filename=filename, - filetype=filetype, - filesize=int(filesize), - url=url, - urltype=urltype, - md5=md5, + for f in input_string.split(":"): + filename, filetype, filesize, url, urltype, md5 = f.split(",") + files.append( + File( + file_id=filename, + filename=filename, + filetype=filetype, + filesize=int(filesize), + url=url, + urltype=urltype, + md5=md5, + ) ) - files.append(file) - return files diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index b021a5a..51198b9 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -1,16 +1,28 @@ -from seqspec.Assay import Assay -from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist -from seqspec.utils import load_spec, map_read_id_to_regions -from seqspec.seqspec_find import find_by_region_type, find_by_region_id +"""Onlist module for seqspec CLI. + +This module provides functionality to generate and manage onlist files for seqspec regions. 
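+
+Example usage (illustrative; assumes barcode regions with onlist files):
+
+    seqspec onlist -m rna -s region-type -i barcode spec.yaml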
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings import os -from seqspec.utils import read_local_list, read_remote_list import itertools from typing import List -from argparse import SUPPRESS, RawTextHelpFormatter -import warnings +from seqspec.Assay import Assay +from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist +from seqspec.utils import ( + load_spec, + map_read_id_to_regions, + read_local_list, + read_remote_list, +) +from seqspec.seqspec_find import find_by_region_type, find_by_region_id -def setup_onlist_args(parser): + +def setup_onlist_args(parser) -> ArgumentParser: + """Create and configure the onlist command subparser.""" subparser = parser.add_parser( "onlist", description=""" @@ -25,20 +37,22 @@ def setup_onlist_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) choices = ["read", "region", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -55,6 +69,7 @@ def setup_onlist_args(parser): format_choices = ["product", "multi"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", type=str, default="product", @@ -63,16 +78,18 @@ def setup_onlist_args(parser): ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, default=None, required=True, @@ -81,7 +98,14 @@ def setup_onlist_args(parser): return subparser -def validate_onlist_args(parser, args): +def validate_onlist_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the onlist command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. 
" @@ -89,32 +113,26 @@ def validate_onlist_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - ids = args.i - fmt = args.f - o = args.o - idtype = args.s + if not args.id: + args.id = args.r - return run_onlist(fn, m, ids, idtype, fmt, o) +def run_onlist(parser: ArgumentParser, args: Namespace) -> None: + """Run the onlist command.""" + validate_onlist_args(parser, args) -def run_onlist(spec_fn, modality, ids, idtype, fmt, o): # the base path is the path to the spec file - base_path = os.path.dirname(os.path.abspath(spec_fn)) + base_path = args.yaml.parent.absolute() # set the save path if it exists - if o: - save_path = os.path.abspath(o) + if args.output: + save_path = args.output else: # otherwise the save path is the same path as the spec - save_path = os.path.join(base_path, "onlist_joined.txt") + save_path = base_path / "onlist_joined.txt" # load spec - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) # if number of barcodes > 1 then we need to join them CMD = { @@ -123,18 +141,20 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): "read": run_onlist_read, } - onlists = CMD[idtype](spec, modality, ids) + onlists = CMD[args.selector](spec, args.modality, args.id) if len(onlists) == 0: - raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") + raise ValueError( + f"No onlist found for {args.modality}, {args.selector}, {args.id}" + ) # for only one onlist we can just return the path # if only one, its remote and we save it to the base path elif len(onlists) == 1: urltype = onlists[0].urltype - onlist_fn = os.path.basename(onlists[0].filename) - onlist_path = os.path.join(base_path, onlist_fn) - if os.path.exists(onlist_path): + onlist_fn = Path(onlists[0].filename).name + onlist_path = base_path / onlist_fn + if onlist_path.exists(): urltype = "local" elif urltype in ["http", "https"]: # download the onlist to the base path and return the path @@ -150,12 +170,11 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): elif o.urltype in ["http", "https"]: # base_path is ignored for remote onlists lsts.append(read_remote_list(o, base_path)) - onlist_elements = join_onlists(lsts, fmt) + onlist_elements = join_onlists(lsts, args.format) onlist_path = write_onlist(onlist_elements, save_path) # print the path to the onlist print(onlist_path) - return def run_onlist_region_type( @@ -174,7 +193,9 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist regions = find_by_region_id(spec, modality, region_id) onlists: List[Onlist] = [] for r in regions: - onlists.append(r.get_onlist()) + ol = r.get_onlist() + if ol: + onlists.append(ol) if len(onlists) == 0: raise ValueError(f"No onlist found for region {region_id}") return onlists diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 11322f6..1b7744a 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -1,13 +1,33 @@ -from seqspec.utils import load_spec -from seqspec.seqspec_print_html import print_seqspec_html +"""Print module for seqspec CLI. + +This module provides functionality to print sequence and/or library structure +in various formats (ascii, png, html). 
+""" + +from typing import List, Any +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace import newick -from seqspec.utils import REGION_TYPE_COLORS +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +import matplotlib.patches as mpatches + +from seqspec.utils import load_spec, REGION_TYPE_COLORS +from seqspec.seqspec_print_html import print_seqspec_html from seqspec.Region import complement_sequence -from seqspec.Region import project_regions_to_coordinates -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay +from seqspec.seqspec_print_utils import libseq + +def setup_print_args(parser) -> ArgumentParser: + """Create and configure the print command subparser. -def setup_print_args(parser): + Args: + parser: The main argument parser to add the print subparser to. + + Returns: + The configured print subparser. + """ subparser = parser.add_parser( "print", description=""" @@ -23,20 +43,23 @@ def setup_print_args(parser): help="Display the sequence and/or library structure from seqspec file", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) format_choices = ["library-ascii", "seqspec-html", "seqspec-png", "seqspec-ascii"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"Format ({', '.join(format_choices)}), default: library-ascii"), + help=f"Format ({', '.join(format_choices)}), default: library-ascii", type=str, default="library-ascii", choices=format_choices, @@ -45,53 +68,87 @@ def setup_print_args(parser): return subparser -def validate_print_args(parser, args): - fmt = args.f +def validate_print_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the print command arguments. + + Args: + parser: The argument parser. + args: The parsed arguments. + + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - fn = args.yaml - o = args.o - if fmt == "seqspec-png" and o is None: + if args.format == "seqspec-png" and args.output is None: parser.error("Output file required for png format") - return run_seqspec_print(fn, fmt, o) +def run_print(parser: ArgumentParser, args: Namespace) -> None: + """Run the print command. + + Args: + parser: The argument parser. + args: The parsed arguments. 
+ """ + + validate_print_args(parser, args) -def run_seqspec_print(spec_fn, fmt, o): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) - # TODO: add reads to seqspec html - # TODO: add reads to seqspec png - CMD = { + # Map format to print function + format_to_function = { "library-ascii": print_library_ascii, "seqspec-html": print_seqspec_html, "seqspec-png": print_seqspec_png, "seqspec-ascii": print_seqspec_ascii, } - s = CMD[fmt](spec) - - if fmt == "png": - return s.savefig(o, dpi=300, bbox_inches="tight") # + result = format_to_function[args.format](spec) - if o: - with open(o, "w") as f: - print(s, file=f) + if args.format == "seqspec-png": + result.savefig(args.output, dpi=300, bbox_inches="tight") + elif args.output: + with open(args.output, "w") as f: + print(result, file=f) else: - print(s) - return + print(result) + + +def print_seqspec_ascii(spec: Assay) -> str: + """Print sequence specification in ASCII format. + Args: + spec: The seqspec specification to print. -def print_seqspec_ascii(spec): - p = [] + Returns: + The ASCII formatted string. + """ + parts = [] for modality in spec.modalities: - p.append(format_libseq(spec, modality, *libseq(spec, modality))) - return "\n".join(p) + parts.append(format_libseq(spec, modality, *libseq(spec, modality))) + return "\n".join(parts) + +def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str: + """Format library sequence for a specific modality. -def format_libseq(spec, modality, p, n): + Args: + spec: The seqspec specification. + modality: The modality to format. + p: Positive strand parts. + n: Negative strand parts. + + Returns: + The formatted string. + """ libspec = spec.get_libspec(modality) - s = "\n".join( + return "\n".join( [ modality, "---", @@ -101,154 +158,123 @@ def format_libseq(spec, modality, p, n): "\n".join(n), ] ) - return s -def libseq(spec, modality): - libspec = spec.get_libspec(modality) - seqspec = spec.get_seqspec(modality) - - p = [] - n = [] - leaves = libspec.get_leaves() - cuts = project_regions_to_coordinates(leaves) - for idx, read in enumerate(seqspec, 1): - read_len = read.max_len - read_id = read.read_id - primer_id = read.primer_id - primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0] - primer_pos = cuts[primer_idx] - if read.strand == "pos": - wsl = primer_pos.stop - 1 - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - p.append(f"{ws}|{arrow}>({idx}) {read_id}") - elif read.strand == "neg": - wsl = primer_pos.start - read_len - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - n.append(f"{ws}<{arrow}|({idx}) {read_id}") - return (p, n) - - -def run_print(data): - header = headerTemplate(data.name, data.doi, data.description, data.modalities) - header2 = "## Final Library" - library_spec = multiModalTemplate(data.library_spec) - s = f"{header}\n{header2}\n{library_spec}" - return s - - -def run_print_sequence_spec(spec): - p = [] - for r in spec.sequence_spec: - p.append( - "\t".join( - [r.read_id, r.primer_id, r.strand, str(r.min_len), str(r.max_len)] - ) - ) - return "\n".join(p) +def print_library_ascii(spec: Assay) -> str: + """Print library structure in ASCII format. + Args: + spec: The seqspec specification to print. -def print_library_ascii(spec): - t = [] + Returns: + The ASCII formatted string. 
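+
+    Example (illustrative; the path is a placeholder):
+        >>> spec = load_spec("spec.yaml")
+        >>> print(print_library_ascii(spec))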
+ """ + trees = [] for r in spec.library_spec: - t.append(r.to_newick()) - n = ",".join(t) - # print(n) - tree = newick.loads(f"({n})") + trees.append(r.to_newick()) + tree_str = ",".join(trees) + tree = newick.loads(f"({tree_str})") return tree[0].ascii_art() -def argsort(arr): - # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3382369#3382369 - # by unutbu - return sorted(range(len(arr)), key=arr.__getitem__) - +def print_seqspec_png(spec: Assay): + """Print sequence specification as PNG. -def print_seqspec_png(spec): - # builds directly off of https://colab.research.google.com/drive/1ZCIGrwLEIfE0yo33bP8uscUNPEn1p1DH developed by https://github.com/LucasSilvaFerreira + Args: + spec: The seqspec specification to print. - # modality + Returns: + The matplotlib figure. + """ modalities = spec.list_modalities() modes = [spec.get_libspec(m) for m in modalities] lengths = [i.min_len for i in modes] nmodes = len(modalities) - # sort the modalities by their lengths + # Sort modalities by length asort = argsort(lengths) modalities = [modalities[i] for i in asort] lengths = [lengths[i] for i in asort] modes = [modes[i] for i in asort] - assay_id = spec.assay_id - fig, _ = plot_png(assay_id, modalities, modes, nmodes, lengths) - return fig + return plot_png(spec.assay_id, modalities, modes, nmodes, lengths) + +def argsort(arr: List[Any]) -> List[int]: + """Get indices that would sort an array. -def plot_png(assay, modalities, modes, nmodes, lengths): - import matplotlib.pyplot as plt - from matplotlib.patches import Rectangle - import matplotlib.patches as mpatches + Args: + arr: The array to sort. + Returns: + List of indices that would sort the array. + """ + return sorted(range(len(arr)), key=arr.__getitem__) + + +def plot_png( + assay: str, modalities: List[str], modes: List[Any], nmodes: int, lengths: List[int] +): + """Create PNG plot of sequence specification. + + Args: + assay: The assay ID. + modalities: List of modalities. + modes: List of mode specifications. + nmodes: Number of modes. + lengths: List of lengths. + + Returns: + The matplotlib figure. 
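+
+    Typically reached via print_seqspec_png(spec); saving the result looks
+    roughly like this (illustrative, output path is a placeholder):
+        >>> fig = print_seqspec_png(spec)
+        >>> fig.savefig("spec.png", dpi=300, bbox_inches="tight")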
+ """ fsize = 15 plt.rcParams.update({"font.size": fsize}) - fig, ax = plt.subplots( - figsize=(10, 1 * nmodes), nrows=nmodes - ) + fig, _ = plt.subplots(figsize=(10, 1 * nmodes), nrows=nmodes) title_offset = 0.98 if nmodes > 1 else 1.2 fig.suptitle(assay, y=title_offset) + rts = [] for m, ax in zip(modes, fig.get_axes()): - # get leaves + # Get leaves leaves = m.get_leaves() - # setup plotting variables + # Setup plotting variables y = 0 x = 0 height = 1 for idx, node in enumerate(leaves): - # region tupe + # Region type rtype = node.region_type.lower() - # add to the global list so we can make a legend rts.append(rtype) - # get region properties + + # Get region properties length = node.min_len label = f"{length}" - # setup rectangle for the region + # Setup rectangle for the region rectangle = Rectangle( (x, y), length, height, color=REGION_TYPE_COLORS[rtype], ec="black" ) - # write in the length of the region in the rectangle + # Write length in the rectangle ax.text( x + length / 2, y + height / 2, label, horizontalalignment="center", verticalalignment="center", - ) # , rotation=90) - # add the rectangle + ) ax.add_patch(rectangle) - # add length to x for next region + # Add length to x for next region x += length ax.autoscale() - - # since all axes use the same scale, set the xlim to be 0 to the max length ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)}) - # hide the spines + # Hide the spines for spine in ["right", "top", "left", "bottom"]: ax.spines[spine].set_visible(False) # Hide the axis and ticks and labels @@ -256,10 +282,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): ax.set_yticklabels([]) ax.set_yticks([]) - # label the modality on the ylabel + # Label the modality on the ylabel ax.set_ylabel(m.region_type, rotation=0, fontsize=20, ha="right", va="center") - # adjust the xaxis for the last modality to show the length + # Adjust the xaxis for the last modality to show the length ax.xaxis.set_visible(True) ax.spines["bottom"].set_visible(True) ax.minorticks_on() @@ -270,68 +296,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): } ) - # setup the figure legend + # Setup the figure legend handles = [] for t in sorted(set(rts)): handles.append(mpatches.Patch(color=REGION_TYPE_COLORS[t], label=t)) fig.legend(handles=handles, loc="center", bbox_to_anchor=(1.1, 0.55)) - return (fig, ax) - - -def headerTemplate(name, doi, description, modalities): - s = f"""# {name} -- DOI: [{doi}]({doi}) -- Description: {description} -- Modalities: {", ".join(modalities)} - """ - return s - -def atomicRegionTemplate( - name, region_type, sequence_type, sequence, min_len, max_len, onlist, ns=0 -): - s = f"""
    {name} - -{' '*ns}- region_type: {region_type} -{' '*ns}- sequence_type: {sequence_type} -{' '*ns}- sequence:
    {sequence}
    -{' '*ns}- min_len: {min_len}
-{' '*ns}- max_len: {max_len}
-{' '*ns}- onlist: {onlist}
-{' '*ns}
    """ - return s - - -def regionsTemplate(regions): - s = "\n".join( - [ - f"{idx + 1}. " - + atomicRegionTemplate( - v.name, - v.region_type, - v.sequence_type, - v.sequence, - v.min_len, - v.max_len, - v.onlist, - len(str(idx + 1)) - + 1 - + 1, # length of string rep of number plus 1 for "." plus 1 for space - ) - for idx, v in enumerate(regions) - ] - ) - return s - - -def libStructTemplate(region): - s = f"""###### {region.name} -
    {region.sequence}
    """ - return s - - -def multiModalTemplate(library_spec): - s = "\n".join( - [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec] - ) - return s + return fig diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index 69b904d..8ec4694 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,19 +1,23 @@ +"""Print HTML module for seqspec. + +This module provides functionality to generate HTML representations of seqspec files. +It is used by the print command with the 'seqspec-html' format option. +""" + +from typing import List, Optional + from seqspec.Assay import Assay -from seqspec.Region import Region -from seqspec.Read import Read -from seqspec.Read import File +from seqspec.Region import Region, Onlist, complement_sequence +from seqspec.Read import Read, File +from seqspec.seqspec_print_utils import libseq -def print_seqspec_html(spec): - # header = headerTemplate(spec.name, spec.doi, spec.description, spec.modalities) - # header2 = "## Final Library" - # library_spec = multiModalTemplate(spec.library_spec) - # s = f"{header}\n{header2}\n{library_spec}" - s = htmlTemplate(spec) - return s +def print_seqspec_html(spec: Assay) -> str: + """Generate HTML representation of seqspec.""" + return htmlTemplate(spec) -def headerTemplate(name, doi, description, modalities): +def headerTemplate(name: str, doi: str, description: str, modalities: List[str]) -> str: s = f"""

    {name}

    • @@ -30,7 +34,7 @@ def headerTemplate(name, doi, description, modalities): return s -def colorSeq(regions): +def colorSeq(regions: List[Region]) -> str: return "".join( [f"<{r.region_type}>{r.sequence}" for r in regions] ) @@ -38,21 +42,22 @@ def colorSeq(regions): def atomicRegionTemplate( region: Region, - name, - region_type, - sequence_type, - sequence, - min_len, - max_len, - onlist, - regions, -): + name: str, + region_type: str, + sequence_type: str, + sequence: str, + min_len: int, + max_len: int, + onlist: Optional[Onlist], + regions: Optional[List[Region]], +) -> str: seq = ( colorSeq(region.get_leaves()) if regions else f"<{region_type}>{sequence}" ) - onlist = f"{onlist.filename} (md5: {onlist.md5})" if onlist else None + + ol = f"{onlist.filename} (md5: {onlist.md5})" if onlist else None lst = [] if regions: for idx, r in enumerate(regions): @@ -73,7 +78,6 @@ def atomicRegionTemplate( else: subseq = "" - # subseq = "
    • " + "
    • ".join( [ for i in regions if regions else '']) s = f"""
      {name}
        @@ -94,7 +98,7 @@ def atomicRegionTemplate(
      • min_len: {min_len}
      • max_len: {max_len}
-      • onlist: {onlist}
+      • onlist: {ol}
      • regions: {subseq}
      @@ -103,27 +107,28 @@ def atomicRegionTemplate( return s -def regionsTemplate(regions): +def regionsTemplate(regions: List[Region]) -> str: + templates = [ + atomicRegionTemplate( + r, + r.region_id, + r.region_type, + r.sequence_type, + r.sequence, + r.min_len, + r.max_len, + r.onlist, + r.regions, + ) + for idx, r in enumerate(regions) + ] s = f"""
      1. - {'
      2. '.join([atomicRegionTemplate( - r, - r.region_id, - r.region_type, - r.sequence_type, - r.sequence, - r.min_len, - r.max_len, - r.onlist, - r.regions, - ) for idx, r in enumerate(regions)])} + {'
      3. '.join(templates)}
      """ return s -def libStructTemplate(spec, modality): - from seqspec.seqspec_print import libseq - from seqspec.Region import complement_sequence - +def libStructTemplate(spec: Assay, modality: str) -> str: libspec = spec.get_libspec(modality) seqspec = spec.get_seqspec(modality) # noqa p, n = libseq(spec, modality) @@ -147,7 +152,7 @@ def libStructTemplate(spec, modality): return s -def atomicReadTemplate(read: Read): +def atomicReadTemplate(read: Read) -> str: files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else "" s = f""" @@ -171,21 +176,21 @@ def atomicReadTemplate(read: Read): return s -def atomicFileTemplate(file: File): +def atomicFileTemplate(file: File) -> str: s = f"""
    • {file.filename} (md5: {file.md5})
    • """ return s -def readsTemplate(reads): +def readsTemplate(reads: List[Read]) -> str: s = f"""
      1. {'
      2. '.join([atomicReadTemplate(r) for r in reads])}
      """ return s -def multiModalTemplate(spec: Assay): +def multiModalTemplate(spec: Assay) -> str: modes = spec.modalities s = "" for m in modes: @@ -202,7 +207,7 @@ def multiModalTemplate(spec: Assay): return s -def htmlTemplate(spec): +def htmlTemplate(spec: Assay) -> str: s = f""" diff --git a/seqspec/seqspec_print_utils.py b/seqspec/seqspec_print_utils.py new file mode 100644 index 0000000..9aebc17 --- /dev/null +++ b/seqspec/seqspec_print_utils.py @@ -0,0 +1,79 @@ +"""Utility functions for printing seqspec files. + +This module contains shared functionality used by both seqspec_print.py and seqspec_print_html.py. +""" + +from typing import List, Tuple +from seqspec.Assay import Assay +from seqspec.Region import complement_sequence, project_regions_to_coordinates + + +def libseq(spec: Assay, modality: str) -> Tuple[List[str], List[str]]: + """Get library sequence parts for a specific modality. + + Args: + spec: The seqspec specification. + modality: The modality to get parts for. + + Returns: + Tuple of (positive strand parts, negative strand parts). + """ + libspec = spec.get_libspec(modality) + seqspec = spec.get_seqspec(modality) + + p = [] + n = [] + leaves = libspec.get_leaves() + cuts = project_regions_to_coordinates(leaves) + + for idx, read in enumerate(seqspec, 1): + read_len = read.max_len + read_id = read.read_id + primer_id = read.primer_id + primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0] + primer_pos = cuts[primer_idx] + + if read.strand == "pos": + wsl = primer_pos.stop - 1 + ws = wsl * " " + + arrowl = read_len - 1 + arrow = arrowl * "-" + + p.append(f"{ws}|{arrow}>({idx}) {read_id}") + elif read.strand == "neg": + wsl = primer_pos.start - read_len + ws = wsl * " " + + arrowl = read_len - 1 + arrow = arrowl * "-" + + n.append(f"{ws}<{arrow}|({idx}) {read_id}") + + return (p, n) + + +def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str: + """Format library sequence for a specific modality. + + Args: + spec: The seqspec specification. + modality: The modality to format. + p: Positive strand parts. + n: Negative strand parts. + + Returns: + The formatted string. + """ + libspec = spec.get_libspec(modality) + + return "\n".join( + [ + modality, + "---", + "\n".join(p), + libspec.sequence, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) diff --git a/seqspec/seqspec_split.py b/seqspec/seqspec_split.py index 3ad405a..20436a7 100644 --- a/seqspec/seqspec_split.py +++ b/seqspec/seqspec_split.py @@ -1,17 +1,25 @@ -import os +"""Split module for seqspec. + +This module provides functionality to split seqspec files into one file per modality. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List, Dict, Any + from seqspec.utils import load_spec from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter -def setup_split_args(parser): +def setup_split_args(parser) -> ArgumentParser: + """Create and configure the split command subparser.""" subparser = parser.add_parser( "split", description=""" Split seqspec file into one file per modality. 
Examples: -seqspec split -o split spec.yaml # Split spec into modalities +seqspec split -o split spec.yaml # Split spec into modalities --- """, help="Split seqspec file by modality", @@ -19,59 +27,66 @@ def setup_split_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) subparser_required.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, - default=None, + help="Path to output files", + type=Path, required=True, ) return subparser -def validate_split_args(parser, args): - # if everything is valid the run_split - fn = args.yaml - o = args.o - return run_split(fn, o) +def validate_split_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the split command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if Path(args.output).exists() and Path(args.output).is_file(): + parser.error(f"Output path exists: {args.output}") -def run_split(spec_fn, o): - spec = load_spec(spec_fn) - specs = split(spec, o) +def run_split(parser: ArgumentParser, args: Namespace) -> None: + """Run the split command.""" + validate_split_args(parser, args) - for spec in specs: - spec["spec"].to_YAML( - os.path.join(os.path.dirname(o), f"{spec['p']}{spec['m']}.yaml") - ) - return + spec = load_spec(args.yaml) + specs = split(spec, args.output) + for spec_info in specs: + output_path = args.output / f"{spec_info['prefix']}{spec_info['modality']}.yaml" + spec_info["spec"].to_YAML(output_path) -def split(spec, o=""): + +def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: + """Split spec into one file per modality.""" specs = [] modalities = spec.list_modalities() - # make a new spec per modality - for m in modalities: + + # Make a new spec per modality + for modality in modalities: info = { "assay_id": spec.assay_id, "name": spec.name, "doi": spec.doi, "date": spec.date, "description": spec.description, - "modalities": [m], + "modalities": [modality], "lib_struct": spec.lib_struct, "library_kit": spec.library_kit, "library_protocol": spec.library_protocol, "sequence_kit": spec.sequence_kit, "sequence_protocol": spec.sequence_protocol, - "sequence_spec": spec.get_seqspec(m), - "library_spec": [spec.get_libspec(m)], + "sequence_spec": spec.get_seqspec(modality), + "library_spec": [spec.get_libspec(modality)], "seqspec_version": spec.seqspec_version, } spec_m = Assay(**info) spec_m.update_spec() - base_o = "spec." if os.path.basename(o) == "" else f"{os.path.basename(o)}." - specs.append({"p": base_o, "spec": spec_m, "m": m}) + + prefix = "spec." if output_dir.name == "" else f"{output_dir.name}." + specs.append({"prefix": prefix, "spec": spec_m, "modality": modality}) + return specs diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 032adbb..62ec210 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -1,75 +1,89 @@ +"""Upgrade module for seqspec. + +This module provides functionality to upgrade seqspec files from older versions to the current version. 
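+
+Typical CLI usage (illustrative; file names are placeholders):
+    seqspec upgrade spec.yaml                   # print upgraded spec to stdout
+    seqspec upgrade -o upgraded.yaml spec.yaml  # write upgraded spec to a file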
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec from seqspec.File import File from seqspec.Region import Onlist -from argparse import RawTextHelpFormatter from seqspec import get_version +from seqspec.Assay import Assay -def setup_upgrade_args(parser): +def setup_upgrade_args(parser) -> ArgumentParser: + """Create and configure the upgrade command subparser.""" subparser = parser.add_parser( "upgrade", description=""" Upgrade seqspec file from older versions to the current version. Examples: -seqspec upgrade spec.yaml # Upgrade the spec file +seqspec upgrade -o upgraded.yaml spec.yaml # Upgrade and save to new file +seqspec upgrade spec.yaml # Upgrade and print to stdout --- """, - # help="upgrade seqspec file", + help="Upgrade seqspec file to current version", formatter_class=RawTextHelpFormatter, ) - # subparser_required = subparser.add_argument_group("required arguments") - - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_upgrade_args(parser, args): - fn = args.yaml - o = args.o +def validate_upgrade_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the upgrade command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - run_upgrade(spec_fn=fn, o=o) + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_upgrade(spec_fn, o): - spec = load_spec(spec_fn) +def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: + """Run the upgrade command.""" + validate_upgrade_args(parser, args) + + spec = load_spec(args.yaml) version = spec.seqspec_version - upgrade(spec, version) - if o: - spec.to_YAML(o) + upgraded_spec = upgrade(spec, version) + + if args.output: + args.output.write_text(upgraded_spec.to_YAML()) else: - print(spec.to_YAML()) + print(upgraded_spec.to_YAML()) -def upgrade(spec, version): +def upgrade(spec: Assay, version: str) -> Assay: + """Upgrade spec to current version.""" UPGRADE = { - "0.0.0": upgrade_0_2_0_to_0_3_0, + "0.0.0": upgrade_0_0_0_to_0_3_0, "0.1.0": upgrade_0_1_0_to_0_3_0, "0.1.1": upgrade_0_1_1_to_0_3_0, "0.2.0": upgrade_0_2_0_to_0_3_0, get_version(): no_upgrade, } - u = UPGRADE[version](spec) - return u + return UPGRADE[version](spec) def no_upgrade(spec): + """No upgrade needed for current version.""" return spec -def upgrade_0_2_0_to_0_3_0(spec): - # for backwards compatibilty, for specs < v0.3.0 set the files to empty - # of the specs < v0.3.0, set the onlist regions with missing properties - # if version.parse(spec.seqspec_version) < version.parse("0.3.0"): - +def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.2.0 to 0.3.0.""" + # Set files to empty for specs < v0.3.0 for r in spec.sequence_spec: r.set_files( [ @@ -85,11 +99,12 @@ def upgrade_0_2_0_to_0_3_0(spec): ] ) + # Update onlist regions with missing properties for r in spec.library_spec: for lf in r.get_leaves(): if lf.onlist is not None: filename = lf.onlist.filename - location = lf.onlist.location + # location = lf.onlist.location md5 = lf.onlist.md5 lf.onlist = Onlist( file_id=filename, @@ 
-99,19 +114,22 @@ def upgrade_0_2_0_to_0_3_0(spec): url="", urltype="", md5=md5, - location=location, + # location=location, ) spec.seqspec_version = get_version() return spec -def upgrade_0_1_1_to_0_3_0(spec): +def upgrade_0_1_1_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.1 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) -def upgrade_0_1_0_to_0_3_0(spec): +def upgrade_0_1_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.0 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) -def upgrade_0_0_0_to_0_3_0(spec): +def upgrade_0_0_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.0.0 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) diff --git a/seqspec/seqspec_version.py b/seqspec/seqspec_version.py index 56a52e0..2ed7c54 100644 --- a/seqspec/seqspec_version.py +++ b/seqspec/seqspec_version.py @@ -1,53 +1,68 @@ +"""Version module for seqspec. + +This module provides functionality to get seqspec tool version and seqspec file version. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec +from seqspec.Assay import Assay from . import __version__ -from argparse import RawTextHelpFormatter -def setup_version_args(parser): +def setup_version_args(parser) -> ArgumentParser: + """Create and configure the version command subparser.""" subparser = parser.add_parser( "version", description=""" Get seqspec version and seqspec file version. Examples: -seqspec version spec.yaml +seqspec version -o version.txt spec.yaml # Save version info to file +seqspec version spec.yaml # Print version info to stdout --- """, help="Get seqspec tool version and seqspec file version", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_version_args(parser, args): - # if everything is valid the run_version - fn = args.yaml - o = args.o - run_version(fn, o) +def validate_version_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the version command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_version(parser: ArgumentParser, args: Namespace) -> None: + """Run the version command.""" + validate_version_args(parser, args) + spec = load_spec(args.yaml) + version_info = version(spec) -def run_version(spec_fn, o): - spec = load_spec(spec_fn) - s = version(spec) - if o: - with open(o, "w") as f: - print(s, file=f) + if args.output: + args.output.write_text(version_info) else: - print(s) - return + print(version_info) -def version(spec): +def version(spec: Assay) -> str: + """Get version information for spec and tool.""" version = spec.seqspec_version tool_version = __version__ - s = f"seqspec version: {tool_version}\nseqspec file version: {version}" - return s + return f"seqspec version: {tool_version}\nseqspec file version: {version}" diff --git a/setup.cfg b/setup.cfg index 594931a..f645753 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,7 +9,7 @@ tag = True [bumpversion:file:README.md] [flake8] -exclude = 
.git,.github,__pycache__,build,dist +exclude = .git,.github,__pycache__,build,dist,venv statistics = True max-line-length = 88 extend-ignore = E203,E501 From 5f1192f97384cbf17951d8fe16cd177c387f2b3a Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 10 Jul 2025 15:21:56 -0500 Subject: [PATCH 15/21] CHECK-214-region-type (#10) --- seqspec/schema/seqspec.schema.json | 1 + 1 file changed, 1 insertion(+) diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 0dc9a04..1fbf3aa 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -268,6 +268,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", From 1a29c3ce9ed8dc32bbfa24a5f68bb67e5437770f Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 10 Jul 2025 16:58:46 -0500 Subject: [PATCH 16/21] CHECK-219-api-merge(#12) --- seqspec/seqspec_check.py | 50 +++++--- seqspec/seqspec_file.py | 100 +++++++-------- seqspec/seqspec_find.py | 67 +++++++--- seqspec/seqspec_index.py | 115 +++++++++-------- seqspec/seqspec_info.py | 250 ++++++++++++++++++++++++++----------- seqspec/seqspec_init.py | 72 +++++++---- seqspec/seqspec_print.py | 36 ++++-- seqspec/seqspec_split.py | 27 ++-- seqspec/seqspec_upgrade.py | 11 +- seqspec/seqspec_version.py | 25 +++- 10 files changed, 488 insertions(+), 265 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 82a2894..e9b2cef 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -5,6 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List, Dict, Optional from jsonschema import Draft4Validator import yaml @@ -62,26 +63,43 @@ def validate_check_args(parser: ArgumentParser, args: Namespace) -> None: parser.error(f"Output path exists but is not a file: {args.output}") +def format_error(errobj, idx=0): + return f"[error {idx}] {errobj['error_message']}" + + +def seqspec_check( + spec: Assay, spec_fn: str, filter_type: Optional[str] = None +) -> List[Dict]: + """Core functionality to check a seqspec and return filtered errors. + + Args: + spec: The Assay object to check + spec_fn: Path to the spec file, used for relative path resolution + filter_type: Optional filter type to apply to errors (e.g. 
"igvf", "igvf_onlist_skip") + + Returns: + List of error dictionaries + """ + errors = check(spec, spec_fn) + if filter_type: + errors = filter_errors(errors, filter_type) + return errors + + def run_check(parser: ArgumentParser, args: Namespace): """Run the check command.""" validate_check_args(parser, args) spec = load_spec(args.yaml) - errors = check(spec, args.yaml) - - if args.skip == "igvf": - errors = filter_errors(errors, "igvf") - elif args.skip == "igvf_onlist_skip": - errors = filter_errors(errors, "igvf_onlist_skip") - - if errors: - if args.output: - with open(args.output, "w") as f: - for idx, e in enumerate(errors, 1): - print(format_error(e, idx), file=f) - else: + errors = seqspec_check(spec, args.yaml, args.skip) + + if args.output: + with open(args.output, "w") as f: for idx, e in enumerate(errors, 1): - print(format_error(e, idx)) + print(format_error(e, idx), file=f) + else: + for idx, e in enumerate(errors, 1): + print(format_error(e, idx)) return errors @@ -123,10 +141,6 @@ def filter_errors(errors, filter_type): return errors -def format_error(errobj, idx=0): - return f"[error {idx}] {errobj['error_message']}" - - def check(spec: Assay, spec_fn: str): # Variety of checks against schema def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 48563e6..d1ad8e5 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -129,55 +129,23 @@ def validate_file_args(parser: ArgumentParser, args: Namespace) -> None: ) -def run_file(parser: ArgumentParser, args: Namespace) -> None: - """Run the file command.""" - validate_file_args(parser, args) - - spec = load_spec(args.yaml) - ids = args.ids.split(",") if args.ids else [] - - files = list_files( - spec, - args.modality, - ids, - args.selector, - args.format, - args.key, - args.yaml, - args.fullpath, - ) - - if files: - if args.output: - args.output.write_text(str(files)) - else: - print(files) - - -def list_files( +def seqspec_file( spec: Assay, modality: str, - ids: List[str], - idtype: str, - fmt: str, - k: str, - spec_fn: Path, - fp: bool = False, -) -> str: - """List files based on the given parameters. + ids: Optional[List[str]] = None, + selector: str = "read", +) -> Dict[str, List[File]]: + """Core functionality to list files from a seqspec. Args: - spec: The seqspec specification. - modality: The modality to list files for. - ids: List of IDs to filter by. - idtype: Type of ID to filter by (read, region, file, region-type). - fmt: Output format (paired, interleaved, index, list, json). - k: Key to use for output (file_id, filename, etc.). - spec_fn: Path to the spec file. - fp: Whether to use full paths for local files. + spec: The Assay object to operate on + spec_fn: Path to the spec file, used for relative path resolution + modality: The modality to list files for + ids: Optional list of IDs to filter by + selector: Type of ID to filter by (read, region, file, region-type) Returns: - Formatted string containing the file information. + Dictionary mapping IDs to lists of File objects """ # NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST # NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read. 
@@ -194,24 +162,46 @@ def list_files( "region-type": list_files_by_region_type, } - FORMAT = { - "list": format_list_files_metadata, - "paired": format_list_files, - "interleaved": format_list_files, - "index": format_list_files, - "json": format_json_files, - } - # Get files based on whether we're filtering by IDs if not ids: # list all files - files = LIST_FILES[idtype](spec, modality) + return LIST_FILES[selector](spec, modality) else: # list files by id - files = LIST_FILES_BY_ID[idtype](spec, modality, ids) + return LIST_FILES_BY_ID[selector](spec, modality, ids) + + +def run_file(parser: ArgumentParser, args: Namespace) -> None: + """Run the file command.""" + validate_file_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] - # Format the output - return FORMAT[fmt](files, fmt, k, spec_fn, fp) + files = seqspec_file( + spec=spec, + modality=args.modality, + ids=ids, + selector=args.selector, + ) + + if files: + FORMAT = { + "list": format_list_files_metadata, + "paired": format_list_files, + "interleaved": format_list_files, + "index": format_list_files, + "json": format_json_files, + } + + result = FORMAT[args.format]( + files, args.format, args.key, Path(args.yaml), args.fullpath + ) + + if args.output: + args.output.write_text(str(result)) + else: + print(result) def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]: diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py index c1826ce..d1cd9f5 100644 --- a/seqspec/seqspec_find.py +++ b/seqspec/seqspec_find.py @@ -7,7 +7,7 @@ from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS import warnings import yaml -from typing import List +from typing import List, Optional, Union from seqspec.utils import load_spec from seqspec.Assay import Assay @@ -104,25 +104,48 @@ def validate_find_args(parser: ArgumentParser, args: Namespace) -> None: args.id = args.r +def seqspec_find( + spec: Assay, selector: str, modality: str, id: Optional[str] = None +) -> Union[List[Read], List[Region], List[File]]: + """Core functionality to find objects in a seqspec file. + + Args: + spec: The Assay object to search in + selector: Type of object to search for (read, region, file, region-type) + modality: The modality to search in + id: The ID to search for (optional) + + Returns: + List of found objects matching the search criteria: + - List[Read] for "read" selector + - List[Region] for "region" and "region-type" selectors + - List[File] for "file" selector + - Empty list for unknown selectors + """ + FIND = { + "region-type": find_by_region_type, + "region": find_by_region_id, + "read": find_by_read_id, + "file": find_by_file_id, + } + + if selector not in FIND: + warnings.warn( + f"Unknown selector '{selector}'. 
Valid selectors are: {', '.join(FIND.keys())}" + ) + return [] + + return FIND[selector](spec, modality, id) + + def run_find(parser: ArgumentParser, args: Namespace) -> None: """Run the find command.""" validate_find_args(parser, args) spec = load_spec(args.yaml) - found = [] - - if args.selector == "region-type": - found = find_by_region_type(spec, args.modality, args.id) - elif args.selector == "region": - found = find_by_region_id(spec, args.modality, args.id) - elif args.selector == "read": - found = find_by_read_id(spec, args.modality, args.id) - elif args.selector == "file": - found = find_by_file_id(spec, args.modality, args.id) - else: - raise ValueError(f"Unknown selector: {args.selector}") + found = seqspec_find(spec, args.selector, args.modality, args.id) - # post processing + # Handle output if args.output: with open(args.output, "w") as f: yaml.dump(found, f, sort_keys=False) @@ -130,7 +153,7 @@ def run_find(parser: ArgumentParser, args: Namespace) -> None: print(yaml.dump(found, sort_keys=False)) -def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: +def find_by_read_id(spec: Assay, modality: str, id: Optional[str]) -> List[Read]: """Find reads by their ID. Args: @@ -142,6 +165,8 @@ def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: A list of Read objects matching the ID. """ rds = [] + if id is None: + return rds reads = spec.get_seqspec(modality) for r in reads: if r.read_id == id: @@ -149,7 +174,7 @@ def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: return rds -def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: +def find_by_file_id(spec: Assay, modality: str, id: Optional[str]) -> List[File]: """Find files by their ID. Args: @@ -161,6 +186,8 @@ def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: A list of File objects matching the ID. """ files = [] + if id is None: + return files lf = list_all_files(spec, modality) for k, v in lf.items(): for f in v: @@ -169,7 +196,7 @@ def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: return files -def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: +def find_by_region_id(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: """Find regions by their ID. Args: @@ -180,12 +207,14 @@ def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: Returns: A list of Region objects matching the ID. """ + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_id(id) return regions -def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: +def find_by_region_type(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: """Find regions by their type. Args: @@ -196,6 +225,8 @@ def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: Returns: A list of Region objects matching the type. 
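+
+    Example (illustrative; modality and region type are placeholders):
+        >>> regions = find_by_region_type(spec, "rna", "barcode")
+        >>> [r.region_id for r in regions]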
""" + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_region_type(id) return regions diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index c08576b..f0caa74 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -6,7 +6,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS import warnings -from typing import List, Optional +from typing import List, Optional, Dict, Any from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import find_by_region_id @@ -156,52 +156,52 @@ def validate_index_args(parser: ArgumentParser, args: Namespace) -> None: parser.error("Must specify ids with -i for -s read or -s region") -def run_index(parser: ArgumentParser, args: Namespace) -> None: - """Run the index command.""" - validate_index_args(parser, args) - - spec = load_spec(args.yaml) - ids = args.ids.split(",") if args.ids else [] - - result = index( - spec, - args.modality, - ids, - args.selector, - args.tool, - args.rev, - args.subregion_type, - ) - - if args.output: - with open(args.output, "w") as f: - print(result, file=f) - else: - print(result) - - -def index( +def seqspec_index( spec: Assay, modality: str, ids: List[str], idtype: str, - fmt: str, rev: bool = False, - subregion_type: Optional[str] = None, +) -> List[Dict[str, Any]]: + """Core functionality to get index information from the spec. + + Args: + spec: The Assay object to index + modality: The modality to index + ids: List of IDs to index + idtype: Type of ID (read, region, file) + rev: Whether to return 3'->5' region order + + Returns: + List of index dictionaries containing region coordinates and strand information + """ + GET_INDICES = { + "file": get_index_by_files, + } + + GET_INDICES_BY_IDS = { + "file": get_index_by_file_ids, + "region": get_index_by_region_ids, + "read": get_index_by_read_ids, + } + + if not ids: + return GET_INDICES[idtype](spec, modality) + return GET_INDICES_BY_IDS[idtype](spec, modality, ids) + + +def format_index( + indices: List[Dict[str, Any]], fmt: str, subregion_type: Optional[str] = None ) -> str: - """Get index information from the spec. + """Format index information into a specific output format. Args: - spec: The seqspec specification. - modality: The modality to index. - ids: List of IDs to index. - idtype: Type of ID (read, region, file). - fmt: Output format. - rev: Whether to return 3'->5' region order. - subregion_type: Optional subregion type. + indices: List of index dictionaries from seqspec_index + fmt: Output format to use + subregion_type: Optional subregion type for filtering Returns: - Formatted index information. + Formatted index information as a string """ FORMAT = { "chromap": format_chromap, @@ -216,22 +216,37 @@ def index( "zumis": format_zumis, } - GET_INDICES = { - "file": get_index_by_files, - } + if fmt not in FORMAT: + warnings.warn( + f"Unknown format '{fmt}'. 
Valid formats are: {', '.join(FORMAT.keys())}" + ) + return "" - GET_INDICES_BY_IDS = { - "file": get_index_by_file_ids, - "region": get_index_by_region_ids, - "read": get_index_by_read_ids, - } + return FORMAT[fmt](indices, subregion_type) - if not ids: - indices = GET_INDICES[idtype](spec, modality) - else: - indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids) - return FORMAT[fmt](indices, subregion_type) +def run_index(parser: ArgumentParser, args: Namespace) -> None: + """Run the index command.""" + validate_index_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + indices = seqspec_index( + spec, + args.modality, + ids, + args.selector, + args.rev, + ) + + result = format_index(indices, args.tool, args.subregion_type) + + if args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) def get_index_by_files(spec, modality): diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index b9573f7..9a25731 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -1,8 +1,6 @@ from seqspec.utils import load_spec import json -from typing import List -from seqspec.Region import Region -from seqspec.Read import Read +from typing import Dict from seqspec.Assay import Assay from argparse import RawTextHelpFormatter, ArgumentParser, Namespace from pathlib import Path @@ -74,105 +72,215 @@ def run_info(parser: ArgumentParser, args: Namespace) -> None: validate_info_args(parser, args) spec = load_spec(args.yaml) - CMD = { + + if args.key: + # Extract data + info = seqspec_info(spec, args.key) + # Format info + result = format_info(info, args.key, args.format) + + if args.output: + with open(args.output, "w") as f: + if args.format == "json": + f.write(result) + else: + print(result, file=f) + else: + print(result) + + +def seqspec_info(spec: Assay, key: str) -> Dict: + """Get information from the spec based on the key. + + Args: + spec: The Assay object to get info from + key: The type of information to retrieve (modalities, meta, sequence_spec, library_spec) + + Returns: + Dictionary containing the requested information + + Raises: + KeyError: If the requested key is not supported + """ + INFO_FUNCS = { "modalities": seqspec_info_modalities, - "meta": seqspec_info, + "meta": seqspec_info_meta, "sequence_spec": seqspec_info_sequence_spec, "library_spec": seqspec_info_library_spec, } - s = "" - if args.key: - s = CMD[args.key](spec, args.format) + if key not in INFO_FUNCS: + raise KeyError( + f"Unsupported info key: {key}. Must be one of {list(INFO_FUNCS.keys())}" + ) + return INFO_FUNCS[key](spec) + - if args.output: - with open(args.output, "w") as f: - json.dump(s, f, sort_keys=False, indent=4) - else: - print(s) +def format_info(info: Dict, key: str, fmt: str = "tab") -> str: + """Format information based on the key and format. + Args: + info: Dictionary containing the information to format + key: The type of information to format (modalities, meta, sequence_spec, library_spec) + fmt: Output format (tab or json) -def seqspec_info(spec: Assay, fmt: str) -> str: - """Get meta information about the spec.""" - s = format_info(spec, fmt) - return s + Returns: + Formatted string + Raises: + KeyError: If the requested key is not supported + """ + FORMAT_FUNCS = { + "modalities": format_modalities, + "meta": format_meta, + "sequence_spec": format_sequence_spec, + "library_spec": format_library_spec, + } + if key not in FORMAT_FUNCS: + raise KeyError( + f"Unsupported format key: {key}. 
Must be one of {list(FORMAT_FUNCS.keys())}" + ) + return FORMAT_FUNCS[key](info, fmt) + + +def seqspec_info_meta(spec: Assay) -> Dict: + """Get meta information about the spec. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing meta information + """ + sd = spec.to_dict() + del sd["library_spec"] + del sd["sequence_spec"] + del sd["modalities"] + return {"meta": sd} -def seqspec_info_library_spec(spec: Assay, fmt: str) -> str: - """Get library specification information.""" + +def seqspec_info_library_spec(spec: Assay) -> Dict: + """Get library specification information. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing library specifications by modality + """ modalities = spec.list_modalities() - s = "" + result = {} for m in modalities: libspec = spec.get_libspec(m) - s += format_library_spec(m, libspec.get_leaves(), fmt) - return s + result[m] = libspec.get_leaves() + return {"library_spec": result} -def seqspec_info_sequence_spec(spec: Assay, fmt: str) -> str: - """Get sequence specification information.""" - reads = format_sequence_spec(spec.sequence_spec, fmt) - return reads +def seqspec_info_sequence_spec(spec: Assay) -> Dict: + """Get sequence specification information. + Args: + spec: The Assay object to get info from -def seqspec_info_modalities(spec: Assay, fmt: str) -> str: - """Get list of modalities.""" - modalities = format_modalities(spec.list_modalities(), fmt) - return modalities + Returns: + Dictionary containing sequence specifications + """ + return {"sequence_spec": spec.sequence_spec} -def format_info(spec: Assay, fmt: str = "tab") -> str: - """Format meta information.""" - sd = spec.to_dict() - del sd["library_spec"] - del sd["sequence_spec"] - del sd["modalities"] - s = "" +def seqspec_info_modalities(spec: Assay) -> Dict: + """Get list of modalities. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing list of modalities + """ + return {"modalities": spec.list_modalities()} + + +def format_meta(info: Dict, fmt: str = "tab") -> str: + """Format meta information. + + Args: + info: Dictionary containing meta information from seqspec_info_meta + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - for k, v in sd.items(): - s += f"{v}\t" - s = s[:-1] + return "\t".join(str(v) for v in info["meta"].values()) elif fmt == "json": - s = json.dumps(sd, sort_keys=False, indent=4) - return s + return json.dumps(info["meta"], sort_keys=False, indent=4) + return "" + + +def format_modalities(info: Dict, fmt: str = "tab") -> str: + """Format list of modalities. + Args: + info: Dictionary containing modalities from seqspec_info_modalities + fmt: Output format (tab or json) -def format_modalities(modalities: List[str], fmt: str = "tab") -> str: - """Format list of modalities.""" - s = "" + Returns: + Formatted string + """ if fmt == "tab": - s = "\t".join(modalities) + return "\t".join(info["modalities"]) elif fmt == "json": - s = json.dumps(modalities, sort_keys=False, indent=4) - return s + return json.dumps(info["modalities"], sort_keys=False, indent=4) + return "" -def format_sequence_spec(sequence_spec: List[Read], fmt: str = "tab") -> str: - """Format sequence specification.""" - s = "" +def format_sequence_spec(info: Dict, fmt: str = "tab") -> str: + """Format sequence specification. 
+ + Args: + info: Dictionary containing sequence specs from seqspec_info_sequence_spec + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - # format the output as a table - for r in sequence_spec: + lines = [] + for r in info["sequence_spec"]: files = ",".join([i.file_id for i in r.files]) if r.files else "" - s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}\n" - s = s[:-1] + lines.append( + f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4) - return s + return json.dumps( + [i.to_dict() for i in info["sequence_spec"]], sort_keys=False, indent=4 + ) + return "" + + +def format_library_spec(info: Dict, fmt: str = "tab") -> str: + """Format library specification. + Args: + info: Dictionary containing library specs from seqspec_info_library_spec + fmt: Output format (tab or json) -def format_library_spec( - modality: str, library_spec: List[Region], fmt: str = "tab" -) -> str: - """Format library specification.""" - s = "" + Returns: + Formatted string + """ if fmt == "tab": - for r in library_spec: - file = None - if r.onlist: - file = r.onlist.filename - s += f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}\n" - s = s[:-1] + lines = [] + for modality, regions in info["library_spec"].items(): + for r in regions: + file = r.onlist.filename if r.onlist else None + lines.append( + f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps( - {modality: [i.to_dict() for i in library_spec]}, sort_keys=False, indent=4 + return json.dumps( + {m: [i.to_dict() for i in r] for m, r in info["library_spec"].items()}, + sort_keys=False, + indent=4, ) - return s + return "" diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py index 361324b..a65af02 100644 --- a/seqspec/seqspec_init.py +++ b/seqspec/seqspec_init.py @@ -54,7 +54,6 @@ def setup_init_args(parser) -> ArgumentParser: help="List of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", required=True, ) - subparser.add_argument( "-o", "--output", @@ -85,39 +84,38 @@ def run_init(parser: ArgumentParser, args: Namespace) -> None: modalities = args.modalities.split(",") reads = parse_reads_string(args.reads) - tree = newick.loads(args.newick) - - if len(tree[0].descendants) != len(modalities): - raise ValueError( - "Number of modalities must match number of modality-FASTQs pairs" - ) + regions = newick_to_regions(args.newick) - spec = init(args.name, modalities, tree[0].descendants, reads) + spec = seqspec_init(args.name, modalities, regions, reads) + yaml_str = spec.to_YAML() + if yaml_str is None: + raise ValueError("Failed to generate YAML string from assay") if args.output: - spec.to_YAML(args.output) + args.output.write_text(yaml_str) else: - print(spec.to_YAML()) + print(yaml_str) -def init( - name: str, modalities: List[str], tree: List[newick.Node], reads: List[Read] +def seqspec_init( + name: str, modalities: List[str], regions: List[Region], reads: List[Read] ) -> Assay: """Initialize a new seqspec specification. Args: - name: Name of the assay. - modalities: List of modalities. - tree: Newick tree nodes. 
- reads: List of read specifications. + name: Name of the assay + modalities: List of modalities + regions: List of Region objects + reads: List of read specifications Returns: - Initialized Assay object. + Initialized Assay object + + Raises: + ValueError: If number of modalities doesn't match number of regions """ - regions = [] - for node in tree: - region = Region(region_id="", region_type="", name="", sequence_type="") - regions.append(newick_to_region(node, region)) + if len(regions) != len(modalities): + raise ValueError("Number of modalities must match number of regions") return Assay( assay_id="", @@ -136,15 +134,39 @@ def init( ) +def newick_to_regions(newick_str: str) -> List[Region]: + """Convert a newick string to a list of Region objects. + + Args: + newick_str: Newick format string representing the library structure + + Returns: + List of Region objects + + Raises: + ValueError: If newick string is invalid + """ + try: + tree = newick.loads(newick_str) + except Exception as e: + raise ValueError(f"Invalid newick string: {e}") + + regions = [] + for node in tree[0].descendants: + region = Region(region_id="", region_type="", name="", sequence_type="") + regions.append(newick_to_region(node, region)) + return regions + + def newick_to_region(node: newick.Node, region: Region) -> Region: """Convert a newick node to a Region object. Args: - node: Newick tree node. - region: Base region object to populate. + node: Newick tree node + region: Base region object to populate Returns: - Populated Region object. + Populated Region object """ region.region_id = node.name region.name = node.name @@ -178,7 +200,7 @@ def parse_reads_string(input_string: str) -> List[Read]: "modality,read_id,primer_id,min_len,strand:..." Returns: - List of Read objects. + List of Read objects """ reads = [] for obj in input_string.split(":"): diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 1b7744a..9ad6687 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -99,7 +99,30 @@ def run_print(parser: ArgumentParser, args: Namespace) -> None: validate_print_args(parser, args) spec = load_spec(args.yaml) + result = seqspec_print(spec, args.format) + if args.format == "seqspec-png": + result.savefig(args.output, dpi=300, bbox_inches="tight") + elif args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) + + +def seqspec_print(spec: Assay, fmt: str): + """Print sequence specification in the specified format. + + Args: + spec: The seqspec specification to print + fmt: The format to print in (library-ascii, seqspec-html, seqspec-png, seqspec-ascii) + + Returns: + The formatted output (string or matplotlib figure) + + Raises: + ValueError: If format is not supported + """ # Map format to print function format_to_function = { "library-ascii": print_library_ascii, @@ -108,15 +131,12 @@ def run_print(parser: ArgumentParser, args: Namespace) -> None: "seqspec-ascii": print_seqspec_ascii, } - result = format_to_function[args.format](spec) + if fmt not in format_to_function: + raise ValueError( + f"Unsupported format: {fmt}. 
Must be one of {list(format_to_function.keys())}" + ) - if args.format == "seqspec-png": - result.savefig(args.output, dpi=300, bbox_inches="tight") - elif args.output: - with open(args.output, "w") as f: - print(result, file=f) - else: - print(result) + return format_to_function[fmt](spec) def print_seqspec_ascii(spec: Assay) -> str: diff --git a/seqspec/seqspec_split.py b/seqspec/seqspec_split.py index 20436a7..2917c81 100644 --- a/seqspec/seqspec_split.py +++ b/seqspec/seqspec_split.py @@ -5,7 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace -from typing import List, Dict, Any +from typing import List from seqspec.utils import load_spec from seqspec.Assay import Assay @@ -53,15 +53,24 @@ def run_split(parser: ArgumentParser, args: Namespace) -> None: validate_split_args(parser, args) spec = load_spec(args.yaml) - specs = split(spec, args.output) + specs = seqspec_split(spec) - for spec_info in specs: - output_path = args.output / f"{spec_info['prefix']}{spec_info['modality']}.yaml" - spec_info["spec"].to_YAML(output_path) + prefix = "spec." if args.output.name == "" else f"{args.output.name}." + for spec_m in specs: + modality = spec_m.list_modalities()[0] + output_path = args.output / f"{prefix}{modality}.yaml" + spec_m.to_YAML(output_path) -def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: - """Split spec into one file per modality.""" +def seqspec_split(spec: Assay) -> List[Assay]: + """Split spec into one file per modality. + + Args: + spec: The Assay object to split + + Returns: + List of Assay objects, each containing a single modality + """ specs = [] modalities = spec.list_modalities() @@ -85,8 +94,6 @@ def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: } spec_m = Assay(**info) spec_m.update_spec() - - prefix = "spec." if output_dir.name == "" else f"{output_dir.name}." - specs.append({"prefix": prefix, "spec": spec_m, "modality": modality}) + specs.append(spec_m) return specs diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 62ec210..b363294 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -55,7 +55,7 @@ def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: spec = load_spec(args.yaml) version = spec.seqspec_version - upgraded_spec = upgrade(spec, version) + upgraded_spec = seqspec_upgrade(spec, version) if args.output: args.output.write_text(upgraded_spec.to_YAML()) @@ -63,7 +63,7 @@ def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: print(upgraded_spec.to_YAML()) -def upgrade(spec: Assay, version: str) -> Assay: +def seqspec_upgrade(spec: Assay, version: str) -> Assay: """Upgrade spec to current version.""" UPGRADE = { "0.0.0": upgrade_0_0_0_to_0_3_0, @@ -73,6 +73,11 @@ def upgrade(spec: Assay, version: str) -> Assay: get_version(): no_upgrade, } + if version not in UPGRADE: + raise ValueError( + f"Unsupported version: {version}. 
Must be one of {list(UPGRADE.keys())}" + ) + return UPGRADE[version](spec) @@ -104,7 +109,6 @@ def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: for lf in r.get_leaves(): if lf.onlist is not None: filename = lf.onlist.filename - # location = lf.onlist.location md5 = lf.onlist.md5 lf.onlist = Onlist( file_id=filename, @@ -114,7 +118,6 @@ def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: url="", urltype="", md5=md5, - # location=location, ) spec.seqspec_version = get_version() return spec diff --git a/seqspec/seqspec_version.py b/seqspec/seqspec_version.py index 2ed7c54..98580ba 100644 --- a/seqspec/seqspec_version.py +++ b/seqspec/seqspec_version.py @@ -5,7 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace - +from typing import Dict from seqspec.utils import load_spec from seqspec.Assay import Assay from . import __version__ @@ -53,16 +53,29 @@ def run_version(parser: ArgumentParser, args: Namespace) -> None: validate_version_args(parser, args) spec = load_spec(args.yaml) - version_info = version(spec) + vinfo = seqspec_version(spec) + finfo = format_version(vinfo) if args.output: - args.output.write_text(version_info) + args.output.write_text(finfo) else: - print(version_info) + print(finfo) -def version(spec: Assay) -> str: +def seqspec_version(spec: Assay) -> Dict: """Get version information for spec and tool.""" version = spec.seqspec_version tool_version = __version__ - return f"seqspec version: {tool_version}\nseqspec file version: {version}" + return {"file_version": version, "tool_version": tool_version} + + +def format_version(vinfo: Dict) -> str: + """Format version information into a string. + + Args: + vinfo: Dictionary containing file_version and tool_version + + Returns: + Formatted string with version information + """ + return f"seqspec version: {vinfo['tool_version']}\nseqspec file version: {vinfo['file_version']}" From f1e094bc034d9ee12c732762ef9b4e31e365240d Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 16 Jul 2025 14:24:00 -0500 Subject: [PATCH 17/21] CHECK-201-read-id (#11) --- seqspec/schema/seqspec_igvf.schema.json | 396 ++++++++++++++++++ .../seqspec_igvf_onlist_skip.schema.json | 395 +++++++++++++++++ seqspec/seqspec_check.py | 29 +- tests/test_region.py | 16 +- tests/test_seqspec_check.py | 132 +++++- tests/test_seqspec_onlist.py | 91 ++-- tests/test_utils.py | 32 +- 7 files changed, 1033 insertions(+), 58 deletions(-) create mode 100644 seqspec/schema/seqspec_igvf.schema.json create mode 100644 seqspec/schema/seqspec_igvf_onlist_skip.schema.json diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json new file mode 100644 index 0000000..b476011 --- /dev/null +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -0,0 +1,396 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper 
that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": 
"integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { + "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read.", + "pattern": "^IGVF.*" + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." 
+ }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json new file mode 100644 index 0000000..9bf86fd --- /dev/null +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -0,0 +1,395 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + 
"type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { 
+ "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read." + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." + }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index e9b2cef..24a3a3f 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -80,7 +80,7 @@ def seqspec_check( Returns: List of error dictionaries """ - errors = check(spec, spec_fn) + errors = check(spec, spec_fn, filter_type) if filter_type: errors = filter_errors(errors, filter_type) return errors @@ -103,24 +103,14 @@ def run_check(parser: ArgumentParser, args: Namespace): return errors -IGVF_FILTERS = [ - {"error_type": "check_schema", "error_object": "'lib_struct'"}, - {"error_type": "check_schema", "error_object": "'library_protocol'"}, - {"error_type": "check_schema", "error_object": "'library_kit'"}, - {"error_type": "check_schema", "error_object": "'sequence_protocol'"}, - {"error_type": "check_schema", "error_object": "'sequence_kit'"}, - {"error_type": "check_schema", "error_object": "'md5'"}, -] -IGVF_ONLIST_SKIP_FILTERS = IGVF_FILTERS + [ +IGVF_ONLIST_SKIP_FILTERS = [ {"error_type": "check_onlist_files_exist", "error_object": "onlist"} ] def filter_errors(errors, filter_type): filters = None - if filter_type == "igvf": - filters = IGVF_FILTERS - elif filter_type == "igvf_onlist_skip": + if filter_type == "igvf_onlist_skip": filters = IGVF_ONLIST_SKIP_FILTERS if filters: @@ -141,10 +131,19 @@ def filter_errors(errors, filter_type): return errors -def check(spec: Assay, spec_fn: str): +def check(spec: Assay, spec_fn: str, skip: str = None): # Variety of checks against schema def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): - schema_fn = 
path.join(path.dirname(__file__), "schema/seqspec.schema.json") + if skip == "igvf": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf.schema.json" + ) + elif skip == "igvf_onlist_skip": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf_onlist_skip.schema.json" + ) + else: + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as stream: schema = yaml.load(stream, Loader=yaml.Loader) validator = Draft4Validator(schema) diff --git a/tests/test_region.py b/tests/test_region.py index 710665e..3afdf3a 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -66,11 +66,8 @@ def test_simple_onlist(self): url = filename urltype = "file" md5sum = "d41d8cd98f00b204e9800998ecf8427e" - location = "local" - permit = Onlist( - file_id, filename, filetype, filesize, url, "file", md5sum, location - ) + permit = Onlist(file_id, filename, filetype, filesize, url, "file", md5sum) self.assertEqual( permit.to_dict(), @@ -204,9 +201,16 @@ def test_onlists(self): list_url = list_name list_urltype = "file" list_md5sum = "d41d8cd98f00b204e9800998ecf8427e" - list_location = "local" - permited = Onlist(list_id, list_name, list_type, list_size, list_url, list_urltype, list_md5sum, list_location) + permited = Onlist( + list_id, + list_name, + list_type, + list_size, + list_url, + list_urltype, + list_md5sum, + ) r = Region( region_name, diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index ae8ad6b..75d31bd 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -30,8 +30,8 @@ def test_check_args(self): cmdline = ["check", "-o", output_name, spec_name] args = parser.parse_args(cmdline) - self.assertEqual(args.o, output_name) - self.assertEqual(args.yaml, spec_name) + self.assertEqual(str(args.output), output_name) + self.assertEqual(str(args.yaml), spec_name) def test_validate_check_args(self): parser = create_stub_check_parser() @@ -49,4 +49,130 @@ def test_validate_check_args(self): with patch("os.path.exists") as path_exists: path_exists.return_value = True errors = validate_check_args(None, args) - self.assertEqual(errors, []) + self.assertEqual(errors, None) + + def test_check_with_igvf_skip(self): + """Test that 'igvf' skip condition filters out some IGVF-related errors but not read_id pattern errors.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test with 'igvf' skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = "igvf" + + # Run check with igvf + errors = run_check(parser, args) + + # Should have exactly 2 errors: read_id pattern error and onlist file error + self.assertEqual( + len(errors), 2, f"Expected 2 errors, got {len(errors)}: {errors}" + ) + + # Check for read_id pattern error (should not be filtered by igvf skip) + read_id_errors = [ + e + for e in errors + if e.get("error_type") == "check_schema" + and "read_id" in e.get("error_message", "") + ] + self.assertEqual( + len(read_id_errors), + 1, + f"Expected 1 read_id error, got {len(read_id_errors)}", + ) + self.assertIn("1165AJSO", read_id_errors[0]["error_message"]) + self.assertIn("does not match", read_id_errors[0]["error_message"]) + + # Check for onlist file error (should not be filtered 
by igvf skip) + onlist_errors = [ + e for e in errors if e.get("error_type") == "check_onlist_files_exist" + ] + self.assertEqual( + len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" + ) + self.assertIn("does not exist", onlist_errors[0]["error_message"]) + + def test_check_with_igvf_onlist_skip(self): + """Test that 'igvf_onlist_skip' skip condition filters out IGVF and onlist errors including read_id pattern.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test with 'igvf_onlist_skip' skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = "igvf_onlist_skip" + + # Run check with igvf_onlist_skip + errors = run_check(parser, args) + + # Should have no errors (all errors are filtered out by igvf_onlist_skip) + self.assertEqual( + len(errors), 0, f"Expected 0 errors, got {len(errors)}: {errors}" + ) + + def test_check_without_skip(self): + """Test that without skip condition, validation errors are reported.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test without skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = None + + # Run check without skip + errors = run_check(parser, args) + + # Should have exactly 2 errors: sequence_protocol error and onlist file error + self.assertEqual( + len(errors), 2, f"Expected 2 errors, got {len(errors)}: {errors}" + ) + + # Check for sequence_protocol error + protocol_errors = [ + e + for e in errors + if e.get("error_type") == "check_schema" + and "sequence_protocol" in e.get("error_message", "") + ] + self.assertEqual( + len(protocol_errors), + 1, + f"Expected 1 sequence_protocol error, got {len(protocol_errors)}", + ) + + # Check for onlist file error + onlist_errors = [ + e for e in errors if e.get("error_type") == "check_onlist_files_exist" + ] + self.assertEqual( + len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" + ) diff --git a/tests/test_seqspec_onlist.py b/tests/test_seqspec_onlist.py index 40f6ad9..cd31149 100644 --- a/tests/test_seqspec_onlist.py +++ b/tests/test_seqspec_onlist.py @@ -1,6 +1,5 @@ from argparse import ArgumentParser from contextlib import contextmanager -from io import StringIO import os from tempfile import TemporaryDirectory from unittest import TestCase @@ -12,7 +11,6 @@ join_onlists, join_product_onlist, join_multi_onlist, - join_onlists, run_onlist_region, run_onlist_read, setup_onlist_args, @@ -33,7 +31,7 @@ def create_temporary_barcode_files(filenames): os.chdir(tmpdir) for name in filenames: filename = os.path.join(tmpdir, name) - with open(filename, "wt") as outstream: + with open(filename, "wt"): pass yield tmpdir finally: @@ -66,13 +64,29 @@ def test_find_list_target_dir_local(self): with create_temporary_barcode_files(["index_onlist.txt"]) as tmpdir: filename = os.path.join(tmpdir, "temp.tsv") - onlist1 = Onlist("temp_id", filename, "tsv", 300, filename, "local", "d41d8cd98f00b204e9800998ecf8427e", "local") + onlist1 = Onlist( + 
"temp_id", + filename, + "tsv", + 300, + filename, + "local", + "d41d8cd98f00b204e9800998ecf8427e", + ) target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, tmpdir) def test_find_list_target_dir_remote(self): - onlist1 = Onlist("temp_id", "temp.tsv", "tsv", 300, "http://localhost:9/temp.tsv", "http", "d41d8cd98f00b204e9800998ecf8427e", "remote") + onlist1 = Onlist( + "temp_id", + "temp.tsv", + "tsv", + 300, + "http://localhost:9/temp.tsv", + "http", + "d41d8cd98f00b204e9800998ecf8427e", + ) target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, os.getcwd()) @@ -126,57 +140,80 @@ def test_join_onlist_multi(self): def test_local_validate_onlist_args(self): onlist_name = "index_onlist.tsv" with create_temporary_barcode_files([onlist_name]) as tmpdir: - expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) - args = parser.parse_args([ - "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) + args = parser.parse_args( + [ + "onlist", + "-m", + "rna", + "-i", + "read1.fastq.gz", + "-f", + "multi", + spec_path, + ] + ) def load_spec(*args, **kwargs): return load_example_spec(example_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - validate_onlist_args(parser, args) + with patch("seqspec.seqspec_onlist.load_spec", load_spec): + with patch("pathlib.Path.exists", return_value=True): + validate_onlist_args(parser, args) def test_local_cached_remote_validate_onlist_args(self): # Test that we will can use a locally cached copy of one barcode file # even if it is marked remote. onlist_name = "index_onlist.txt" with create_temporary_barcode_files([onlist_name]) as tmpdir: - expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) - args = parser.parse_args([ - "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) + args = parser.parse_args( + [ + "onlist", + "-m", + "rna", + "-i", + "read1.fastq.gz", + "-f", + "multi", + spec_path, + ] + ) def load_spec(*args, **kwargs): - remote_spec = example_spec.replace( - "location: local", - "location: remote" - ).replace( - "url: index_onlist.tsv", - "url: http://localhost:9/foo/index_onlist.tsv" - ).replace( - "urltype: local", - "urltype: http", + remote_spec = ( + example_spec.replace("location: local", "location: remote") + .replace( + "url: index_onlist.tsv", + "url: http://localhost:9/foo/index_onlist.tsv", + ) + .replace( + "urltype: local", + "urltype: http", + ) ) print(remote_spec) return load_example_spec(remote_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader, patch("seqspec.seqspec_onlist.read_remote_list", return_value="index_onlist.tsv") as fake_remote_list: - # Failed validation would raise an exception - validate_onlist_args(parser, args) - + with patch("seqspec.seqspec_onlist.load_spec", load_spec), patch( + "seqspec.seqspec_onlist.read_remote_list", + return_value="index_onlist.tsv", + ): + with patch("pathlib.Path.exists", return_value=True): + # Failed validation would raise an exception + validate_onlist_args(parser, args) def test_write_onlist_no_double_spacing(self): # Make sure that joined onlists don't end up double spaced. 
- + onlists = [ ["AAAA", "TTTT"], ["GGGG", "CCCC", "GGTT"], diff --git a/tests/test_utils.py b/tests/test_utils.py index 34a36bc..0fbe293 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,14 +2,16 @@ from hashlib import md5 from io import StringIO, BytesIO import os -from pathlib import Path from tempfile import TemporaryDirectory from requests import HTTPError from unittest import TestCase from unittest.mock import patch from seqspec.Region import ( - Region, RegionCoordinate, Onlist, project_regions_to_coordinates + Region, + RegionCoordinate, + Onlist, + project_regions_to_coordinates, ) from seqspec.utils import ( get_remote_auth_token, @@ -18,9 +20,8 @@ write_read, read_local_list, read_remote_list, - yield_onlist_contents + yield_onlist_contents, ) -from seqspec import __version__ from .test_region import ( region_rna_joined_dict, @@ -154,6 +155,7 @@ parent_id: rna """ + def load_example_spec(spec_text): with StringIO(spec_text) as instream: spec = load_spec_stream(instream) @@ -217,7 +219,15 @@ def test_read_local_list(self): with gzip.open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") + onlist1 = Onlist( + "123", + temp_list_filename, + "tsv", + 300, + temp_list_filename, + "local", + fake_md5, + ) loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -232,7 +242,15 @@ def test_read_local_list_gz(self): with open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") + onlist1 = Onlist( + "123", + temp_list_filename, + "tsv", + 300, + temp_list_filename, + "local", + fake_md5, + ) loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -256,7 +274,7 @@ def raise_for_status(self): with patch("requests.get", new=fake_request_get): url = "http://localhost/testlist.txt" - onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5, "remote") + onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5) loaded_list = read_remote_list(onlist1) self.assertEqual(fake_onlist, loaded_list) From 5ca2bbed4beb69bc4b678b63564addcf33e706d1 Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 16 Jul 2025 14:51:02 -0500 Subject: [PATCH 18/21] add bead_TSO to all schema --- seqspec/schema/seqspec_igvf.schema.json | 1 + seqspec/schema/seqspec_igvf_onlist_skip.schema.json | 1 + 2 files changed, 2 insertions(+) diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json index b476011..2b7a258 100644 --- a/seqspec/schema/seqspec_igvf.schema.json +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -163,6 +163,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json index 9bf86fd..3ec0d61 100644 --- a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -163,6 +163,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", From aff80515dd9d95de1c7855e51c53141f6f1cbc28 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:18:20 -0500 Subject: [PATCH 19/21] CHECK-231-region-type (#72) (#16) --- docs/SPECIFICATION.md | 1 + 
docs/assays/10xcrispr.spec.yaml | 2 +- docs/assays/sccrispra.spec.yaml | 2 +- seqspec/schema/seqspec.schema.json | 1 + seqspec/schema/seqspec_igvf.schema.json | 1 + .../schema/seqspec_igvf_onlist_skip.schema.json | 1 + seqspec/seqspec_index.py | 17 +++++------------ 7 files changed, 11 insertions(+), 14 deletions(-) diff --git a/docs/SPECIFICATION.md b/docs/SPECIFICATION.md index f4ad0be..63b8384 100644 --- a/docs/SPECIFICATION.md +++ b/docs/SPECIFICATION.md @@ -153,6 +153,7 @@ Each `Region` has the following properties which are useful to annotate the elem - `rna`: The modality corresponding to assaying RNA. - `s5`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME1. - `s7`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME2. + - `sgrna_target`: A sequence corresponding to the guide RNA spacer region that determines the genomic target of CRISPR-based perturbations. - `tag`: A short sequence of DNA or RNA used to label or identify a sample, protein, or other grouping. - `truseq_read1`: The first read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. - `truseq_read2`: The second read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. diff --git a/docs/assays/10xcrispr.spec.yaml b/docs/assays/10xcrispr.spec.yaml index 87e4d36..a95cc5a 100644 --- a/docs/assays/10xcrispr.spec.yaml +++ b/docs/assays/10xcrispr.spec.yaml @@ -203,7 +203,7 @@ library_spec: parent_id: crispr - !Region region_id: sgrna_target - region_type: crispr + region_type: sgrna_target name: sgrna_target sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNXXX diff --git a/docs/assays/sccrispra.spec.yaml b/docs/assays/sccrispra.spec.yaml index e0f2790..c825dc3 100644 --- a/docs/assays/sccrispra.spec.yaml +++ b/docs/assays/sccrispra.spec.yaml @@ -223,7 +223,7 @@ library_spec: - !Region parent_id: crispr_R2_001.fastq.gz region_id: gRNA - region_type: gRNA + region_type: sgrna_target name: Guide RNAs sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNNNN diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 1fbf3aa..5b3ddd0 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -296,6 +296,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json index 2b7a258..9d0ffd3 100644 --- a/seqspec/schema/seqspec_igvf.schema.json +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -191,6 +191,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json index 3ec0d61..9908307 100644 --- a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -191,6 +191,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index f0caa74..fff7f59 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -319,6 +319,9 @@ def get_index_by_primer( return {read_id: new_rcs, "strand": rdc.read.strand} +FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"} + + def format_kallisto_bus(indices, subregion_type=None): bcs = [] umi = [] @@ -331,12 +334,7 @@ def format_kallisto_bus(indices, subregion_type=None): 
bcs.append(f"{idx},{cut.start},{cut.stop}") elif cut.region_type.upper() == "UMI": umi.append(f"{idx},{cut.start},{cut.stop}") - elif ( - cut.region_type.upper() == "CDNA" - or cut.region_type.upper() == "GDNA" - or cut.region_type.upper() == "PROTEIN" - or cut.region_type.upper() == "TAG" - ): + elif cut.region_type.upper() in FEATURE_REGION_TYPES: feature.append(f"{idx},{cut.start},{cut.stop}") if len(umi) == 0: umi.append("-1,-1,-1") @@ -362,12 +360,7 @@ def format_kallisto_bus_force_single(indices, subregion_type=None): bcs.append(f"{idx},{cut.start},{cut.stop}") elif cut.region_type.upper() == "UMI": umi.append(f"{idx},{cut.start},{cut.stop}") - elif ( - cut.region_type.upper() == "CDNA" - or cut.region_type.upper() == "GDNA" - or cut.region_type.upper() == "PROTEIN" - or cut.region_type.upper() == "TAG" - ): + elif cut.region_type.upper() in FEATURE_REGION_TYPES: length = cut.stop - cut.start if length > max_length: max_length = length From 9fd3b559839e78c5627493b9522f7a753151f2a7 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:52:26 -0500 Subject: [PATCH 20/21] CHECK-244-random-x (#18) --- seqspec/seqspec_check.py | 14 +- ...urementSet_X056_G4_RNA_rna_seqspec.yaml.gz | Bin 0 -> 1400 bytes tests/data/seqspec_valid_ignore_onlist.yaml | 197 ++++++++++++++++++ tests/test_seqspec_check.py | 37 +++- 4 files changed, 228 insertions(+), 20 deletions(-) create mode 100644 tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz create mode 100644 tests/data/seqspec_valid_ignore_onlist.yaml diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 24a3a3f..672ebaa 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -453,20 +453,17 @@ def seqtype_check(rgn, errors, idx): "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" - # ) errors.append(errobj) idx += 1 - if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: + if rgn.sequence_type == "random" and ( + set(rgn.sequence) != {"X"} + or not (rgn.min_len <= len(rgn.sequence) <= rgn.max_len) + ): errobj = { "error_type": "check_sequence_types", "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" - # ) errors.append(errobj) idx += 1 if rgn.sequence_type == "onlist" and not rgn.onlist: @@ -475,9 +472,6 @@ def seqtype_check(rgn, errors, idx): "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" - # ) errors.append(errobj) idx += 1 if rgn.regions: diff --git a/tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz b/tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz new file mode 100644 index 0000000000000000000000000000000000000000..217d923c9d8ca9a0fa9284546891ba7a856d45f3 GIT binary patch literal 1400 zcmV-;1&8_{iwFP!000021MOJdbDKC2fA?RZzO@gwg+K!QRL5#2q(8rH38YG~9ZW31U@Z0P#f8hJcZ3!3PcJd%h2XXMFLB*8W-bPCq@{!GM)km+h8 zd73A3BctqEW~)0)*~&3ipagOx%$5(^M5fA?(I)nI7-WG=XQMGnK(8=xS80~yB5TxB zi%-@;_BnSaK|HM-GF7q(Paap>B+gNp3Q>3;8>0Cwv7^rX=dimjO 
zIl~z18n$2U*7!4k4Qub9nmhYV%Tt!SF4ZRE~3e@+j?|=^PA%|PQMRGk9 zxJJVs;40=xz!BZBAjjx;;8tzhgIljcM{#2oH?a^Fm4@a~&+?cpuqPSQG)#zg95;q- zU|I4>euKPTucgVYo?ExVHqCd zwvfJWP-ZhsOv^Nn!)4h_pI@BacG8SMnwif@GcqvEhS$fIhKIn_%#(oAOk!(rTpQ2E zt={$!w|<3=;l^!C2uZ1QeQ=#iEX^gJ$+7ULB_UyVM{VnDTdO7jAm9m{fQ6r;JPZqD z6{`lNA(T6pq+?CaSRLIkQV7-OX*utYIEbXzqR-HLe&eX=jP)uzz6Rx3Na-K z|DAS9>C9^9lvxbpC=Al969{!)?ILC`X3B8r2F*CA`m@!Rw%KJUbeUBm)-Bz~lG=$M zKX@p@XK(~dTiT#Y*xH#XOllMPp;uFvL%UYc8XAAPo#T6K(V@ZGnpD2Lth z*jV=BL$laHO@z`;-_I}KwYcr7ShFW0y{}79yDDbseT!CAMU#?>_>GE~aW@ZnvR4r6 z9S;j)zb-umu_}qS>$<)r4D9+Q)r91jV(LR9!WrW(wtfiOoxZSx`2n|TWZ-60U2=9; zhhgFVtwOjT7`nb67}fZPhrEzsxN>2VDnP(gBtXi{YNQJvpr36m3k$P~lb5 zyIgJrPc`-5{B%xN+&P38#y_Xnbgl@;D=ulF)ULKB>v%+@=!mf=rpX#yA-X7Z=oCe1)6YT=f34Sst`2PkNP%)`y G7XSbZ`mVnK literal 0 HcmV?d00001 diff --git a/tests/data/seqspec_valid_ignore_onlist.yaml b/tests/data/seqspec_valid_ignore_onlist.yaml new file mode 100644 index 0000000..2d73ac0 --- /dev/null +++ b/tests/data/seqspec_valid_ignore_onlist.yaml @@ -0,0 +1,197 @@ +!Assay +seqspec_version: 0.3.0 +assay_id: 10x-ATAC-RNA-MULTI +name: 10x-ATAC-RNA-MULTI/Illumina +doi: https://doi.org/10.1038/s41592-019-0433-8 +date: 17 June 2019 +description: ansuman-satpathy:igvf_exp11_atac_10x4_NGS1 Single Cell Multiome ATAC +modalities: +- atac +lib_struct: https://igvf.github.io/seqspec/ +library_protocol: single-nucleus ATAC-seq (OBI:0002762) +library_kit: Illumina Truseq Dual Index +sequence_protocol: Illumina NovaSeq X +sequence_kit: NovaSeq X Series 10B Reagent Kit +sequence_spec: +- !Read + read_id: 1165AJSO + name: Read 1 + modality: atac + primer_id: atac-nextera_read1 + min_len: 50 + max_len: 50 + strand: pos + files: + - !File + file_id: IGVFFI1165AJSO + filename: IGVFFI1165AJSO.fastq.gz + filetype: '' + filesize: 4960657092 + url: https://api.data.igvf.org/sequence-files/IGVFFI1165AJSO/@@download/IGVFFI1165AJSO.fastq.gz + urltype: https + md5: 0a4d87a0edf52511e72948c11de9df8b +- !Read + read_id: IGVFFI2309FCAH + name: Index 1 (i7 index) + modality: atac + primer_id: atac-nextera_read2 + min_len: 8 + max_len: 8 + strand: pos + files: + - !File + file_id: IGVFFI2309FCAH + filename: IGVFFI2309FCAH.fastq.gz + filetype: '' + filesize: 1176913287 + url: https://api.data.igvf.org/sequence-files/IGVFFI2309FCAH/@@download/IGVFFI2309FCAH.fastq.gz + urltype: https + md5: 1f17b83b0c293ad74507cf0dde38a286 +- !Read + read_id: IGVFFI6229GGKZ + name: Read 2 (technically Index 2 (i5 index)) + modality: atac + primer_id: atac-nextera_read1 + min_len: 24 + max_len: 24 + strand: neg + files: + - !File + file_id: IGVFFI6229GGKZ + filename: IGVFFI6229GGKZ.fastq.gz + filetype: '' + filesize: 2696388379 + url: https://api.data.igvf.org/sequence-files/IGVFFI6229GGKZ/@@download/IGVFFI6229GGKZ.fastq.gz + urltype: https + md5: bc9775c746941a760da73a6304c1b0bd +- !Read + read_id: IGVFFI9141IFTT + name: Read 3 (technically Read 2) + modality: atac + primer_id: atac-nextera_read2 + min_len: 50 + max_len: 50 + strand: neg + files: + - !File + file_id: IGVFFI9141IFTT + filename: IGVFFI9141IFTT.fastq.gz + filetype: '' + filesize: 4922922820 + url: https://api.data.igvf.org/sequence-files/IGVFFI9141IFTT/@@download/IGVFFI9141IFTT.fastq.gz + urltype: https + md5: 0a8ee69e4918bb52664bbf4a3842c405 +library_spec: +- !Region + parent_id: null + region_id: atac + region_type: bead_TSO + name: ATAC + sequence_type: joined + sequence: 
AATGATACGGCGACCACCGAGATCTACACNNNNNNNNNNNNNNNNCGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG + min_len: 352 + max_len: 352 + onlist: null + regions: + - !Region + parent_id: atac + region_id: atac-illumina_p5 + region_type: illumina_p5 + name: Illumina P5 + sequence_type: fixed + sequence: AATGATACGGCGACCACCGAGATCTACAC + min_len: 29 + max_len: 29 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-cell_barcode + region_type: barcode + name: R2 Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: !Onlist + file_id: IGVFFI7587TJLC + filename: IGVFFI7587TJLC.tsv.gz + filetype: '' + filesize: 2465078 + url: https://api.data.igvf.org/tabular-files/IGVFFI7587TJLC/@@download/IGVFFI7587TJLC.tsv.gztest + urltype: https + md5: 91f5bd173373fa1815830444480236fb + regions: null + - !Region + parent_id: atac + region_id: atac-linker + region_type: linker + name: atac linker + sequence_type: fixed + sequence: CGCGTCTG + min_len: 8 + max_len: 8 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-nextera_read1 + region_type: nextera_read1 + name: nextera_read1 + sequence_type: fixed + sequence: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG + min_len: 33 + max_len: 33 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: gDNA + region_type: gdna + name: gDNA + sequence_type: random + sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + min_len: 200 + max_len: 200 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-nextera_read2 + region_type: nextera_read2 + name: nextera_read2 + sequence_type: fixed + sequence: CTGTCTCTTATACACATCTCCGAGCCCACGAGAC + min_len: 34 + max_len: 34 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-index7 + region_type: index7 + name: ATAC index7 + sequence_type: onlist + sequence: NNNNNNNN + min_len: 8 + max_len: 8 + onlist: !Onlist + file_id: IGVFFI1608YDWY + filename: IGVFFI1608YDWY.csv.gz + filetype: '' + filesize: 1658 + url: https://api.data.igvf.org/tabular-files/IGVFFI1608YDWY/@@download/IGVFFI1608YDWY.csv.gz + urltype: https + md5: db54507732297fafea74bacfcc203238 + regions: null + - !Region + parent_id: atac + region_id: atac-illumina_p7 + region_type: illumina_p7 + name: Illumina P7 + sequence_type: fixed + sequence: ATCTCGTATGCCGTCTTCTGCTTG + min_len: 24 + max_len: 24 + onlist: null + regions: null diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index 75d31bd..c81957a 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -1,8 +1,9 @@ -from argparse import ArgumentParser from pathlib import Path from tempfile import TemporaryDirectory from unittest import TestCase from unittest.mock import patch +from seqspec.seqspec_check import run_check +from argparse import ArgumentParser, Namespace from seqspec.seqspec_check import ( setup_check_args, @@ -53,8 +54,6 @@ def test_validate_check_args(self): def test_check_with_igvf_skip(self): """Test that 'igvf' skip condition filters out some IGVF-related errors but not read_id pattern errors.""" - from 
seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace # Create a parser parser = ArgumentParser() @@ -104,16 +103,14 @@ def test_check_with_igvf_skip(self): def test_check_with_igvf_onlist_skip(self): """Test that 'igvf_onlist_skip' skip condition filters out IGVF and onlist errors including read_id pattern.""" - from seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace - + file_path = "tests/data/seqspec_valid_ignore_onlist.yaml" # Create a parser parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_check_args(subparser) # Test file path - test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + test_file = Path(file_path) # Test with 'igvf_onlist_skip' skip args = Namespace() @@ -131,9 +128,6 @@ def test_check_with_igvf_onlist_skip(self): def test_check_without_skip(self): """Test that without skip condition, validation errors are reported.""" - from seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace - # Create a parser parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") @@ -176,3 +170,26 @@ def test_check_without_skip(self): self.assertEqual( len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" ) + + def test_check_sequence_type_random_x(self): + file_path = ( + "tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz" + ) + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path(file_path) + + # Test without skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = None + + # Run check without skip + errors = run_check(parser, args) + # should have no errors + self.assertEqual(len(errors), 0) From 876aeefa058d2e8a73cf853a0f2e074f271ef288 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:32:50 -0600 Subject: [PATCH 21/21] update readme (#21) --- docs/INSTALLATION.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index 9e7db60..46e683a 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -7,16 +7,8 @@ authors: # Installation -The development version can be installed with - -```bash -pip install git+https://github.com/pachterlab/seqspec@devel -``` - -The official release can be installed directly from pypi - ```bash -pip install seqspec +pip install git+https://github.com/IGVF-DACC/seqspec.git@main ``` Verify the installation
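
A minimal way to verify the installation, not part of the patch itself: the commands below are illustrative and only assume the `seqspec` console entry point installed by the package and that `pip` points at the same environment used for the install above.

```bash
# Confirm the seqspec CLI entry point is on PATH and list its subcommands
seqspec --help

# Show the installed package metadata (name, version, install location)
pip show seqspec
```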