From 2e19173a5743a4ef66c3770842d27027ed747219 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:42:42 -0500 Subject: [PATCH 01/21] update schema (#52) --- seqspec/schema/seqspec.schema.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 997d14a..0dc9a04 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -71,6 +71,8 @@ "massively parallel reporter assay (OBI:0002675)", "chromosome conformation capture-on-chip assay (OBI:0002458)", "single nucleus methylation chromatin conformation capture seq (NTR:0000745)", + "in vitro CRISPR screen using flow cytometry (OBI:0003661)", + "in vitro CRISPR screen using single-cell RNA-seq (OBI:0003660)", "Custom" ] }, @@ -141,6 +143,7 @@ "Illumina NextSeq 2000 (EFO:0010963)", "Illumina NovaSeq X (NTR:0000765)", "PacBio RS II (EFO:0008631)", + "Illumina NovaSeq X (EFO:0022840)", "Custom" ] }, From 289e7e3228a6c35289e82d1d760ca6c84610f16d Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 1 Nov 2024 17:43:22 -0500 Subject: [PATCH 02/21] update file_exsits function to check file url in igvf portal (#53) --- seqspec/utils.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/seqspec/utils.py b/seqspec/utils.py index bad593d..1ed0b25 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -119,6 +119,16 @@ def region_ids_in_spec(seqspec, modality, region_ids): def file_exists(uri): try: + if uri.startswith("https://api.data.igvf.org"): + auth = get_remote_auth_token() + if auth is None: + print("Warning: IGVF_API_KEY and IGVF_SECRET_KEY not set") + r = requests.head(uri, auth=auth) + if r.status_code == 307: + # igvf download link will redirect to a presigned amazon s3 url, HEAD request will not work. 
+ r = requests.get(r.headers["Location"], headers={"Range": "bytes=0-0"}) + return r.status_code == 206 + return r.status_code == 200 r = requests.head(uri) if r.status_code == 302: return file_exists(r.headers["Location"]) From 2a5df33dab6457c31fd9999759d99f262d1ff9cb Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Sun, 3 Nov 2024 19:08:43 -0800 Subject: [PATCH 03/21] adding seqspec spec tokenization --- seqspec/main.py | 6 +- seqspec/seqspec_convert.py | 403 +++++++++++++++++++++++++++++++++++++ seqspec/seqspec_genbank.py | 210 ------------------- 3 files changed, 406 insertions(+), 213 deletions(-) create mode 100644 seqspec/seqspec_convert.py delete mode 100644 seqspec/seqspec_genbank.py diff --git a/seqspec/main.py b/seqspec/main.py index d64a881..a5bb35e 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -6,7 +6,7 @@ from .seqspec_check import setup_check_args, validate_check_args from .seqspec_find import setup_find_args, validate_find_args -# from .seqspec_genbank import setup_genbank_args, validate_genbank_args +from .seqspec_convert import setup_convert_args, validate_convert_args from .seqspec_modify import setup_modify_args, validate_modify_args from .seqspec_index import setup_index_args, validate_index_args from .seqspec_info import setup_info_args, validate_info_args @@ -54,7 +54,7 @@ def main(): "find": setup_find_args(subparsers), "file": setup_file_args(subparsers), "format": setup_format_args(subparsers), - # "genbank": setup_genbank_args(subparsers), + "convert": setup_convert_args(subparsers), "index": setup_index_args(subparsers), "info": setup_info_args(subparsers), "init": setup_init_args(subparsers), @@ -98,7 +98,7 @@ def main(): "version": validate_version_args, "file": validate_file_args, "upgrade": validate_upgrade_args, - # "genbank": validate_genbank_args, + "convert": validate_convert_args, } COMMAND_TO_FUNCTION[sys.argv[1]](parser, args) diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py new file mode 100644 index 0000000..ccffa7c --- /dev/null +++ b/seqspec/seqspec_convert.py @@ -0,0 +1,403 @@ +from seqspec.utils import load_genbank +import json +from seqspec.Region import Region +from seqspec.Assay import Assay +from seqspec.utils import load_spec +import numpy as np +from typing import Dict, List, Tuple +from os import path +from pathlib import Path + + +schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + +with open(schema_fn, "r") as f: + schema = json.load(f) +REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"] +MODALITIES = schema["properties"]["modalities"]["items"]["enum"] +SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"] + + +def setup_convert_args(parser): + subparser = parser.add_parser( + "convert", + description="get genbank about seqspec file", + help="get genbank about seqspec file", + ) + choices = ["genbank", "seqspec", "token"] + subparser.add_argument( + "-ifmt", help="Input format", type=str, default="seqspec", choices=choices + ) + subparser.add_argument( + "-ofmt", help="Output format", type=str, default="token", choices=choices + ) + + subparser.add_argument( + "-o", + metavar="OUT", + help=("Path to output file"), + type=str, + default=None, + required=False, + ) + subparser.add_argument( + "input_file", metavar="IN", help="Path to input file", type=str + ) + return subparser + + +def validate_convert_args(parser, args): + # if everything is valid the run_convert + fn = args.input_file + ifmt = args.ifmt + ofmt = args.ofmt + 
o = args.o + + cnv = run_convert(fn, ifmt, ofmt, o) + print(cnv) + + # if o: + # spec.to_YAML(o) + # else: + # print(json.dumps(spec, sort_keys=False, indent=4)) + + +def load_input_file(fn, ifmt): + LOAD = { + "genbank": load_genbank, + "seqspec": load_spec, + # "token": load_token, + } + return LOAD[ifmt](fn) + + +def get_feature_names() -> List[str]: + """Generate ordered list of column names""" + features = [] + + # Modality one-hot features + features.extend([f"modality_{mod}" for mod in MODALITIES]) + + # Region type one-hot features + features.extend([f"region_{rt}" for rt in REGION_TYPES]) + + # Sequence type one-hot features + features.extend([f"seq_{st}" for st in SEQUENCE_TYPES]) + + # Numerical features + features.extend(["min_len", "max_len", "position"]) + + return features + + +def save_tokenized_spec( + matrix: np.ndarray, row_identifiers: List[Tuple[str, str, str]], output_path: str +): + """ + Save tokenized spec output to three files: + - spec.npy: The matrix data + - rows.txt: Tab-separated list of (spec_id, modality, region_type) + - cols.txt: List of column names + + Args: + matrix: The tokenized matrix from tokenize_specs + row_identifiers: List of (spec_id, modality, region_type) tuples + output_path: Path to save the output (directory) + """ + # Create output directory if needed + output_dir = Path(output_path) + output_dir.mkdir(parents=True, exist_ok=True) + + # Save matrix + np.save(output_dir / "spec.npy", matrix) + + # Save row identifiers (tab-separated) + with open(output_dir / "rows.txt", "w") as f: + for spec_id, modality, region_type in row_identifiers: + f.write(f"{spec_id}\t{modality}\t{region_type}\n") + + # Save column names + feature_names = get_feature_names() + with open(output_dir / "cols.txt", "w") as f: + for feature in feature_names: + f.write(f"{feature}\n") + + +def run_convert(fn, ifmt, ofmt, o): + CONVERT = { + ("genbank", "seqspec"): gb_to_seqspec, + ("seqspec", "token"): seqspec_to_token, + } + file = load_input_file(fn, ifmt) + c = CONVERT[(ifmt, ofmt)](file) + if o: + save_tokenized_spec(*c, o) + else: + return c + return + + +def seqspec_to_token(spec): + # for each modalitiy, make a dictionary of regions + specs_regions = {} + modalities = spec.list_modalities() + for modality in modalities: + regions = [i.to_dict() for i in spec.get_libspec(modality).get_leaves()] + specs_regions[modality] = regions + + # Convert to tokenized matrix + return tokenize_specs({spec.assay_id: specs_regions}) + + +def tokenize_specs( + specs_regions: Dict[str, Dict[str, List[Dict]]] +) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]: + """ + Convert specs into a single matrix where each row represents a complete region specification + + Args: + specs_regions: Dict[spec_id -> Dict[modality -> List[region_dict]]] + + Returns: + - Matrix where each row is [modality_onehot, region_type_onehot, sequence_type_onehot, min_len, max_len, position] + - List of (spec_id, modality, region_type) identifying each row + """ + # Calculate feature dimensions + n_modality_features = len(MODALITIES) + n_region_type_features = len(REGION_TYPES) + n_sequence_type_features = len(SEQUENCE_TYPES) + + # Total features = one-hot encodings + numerical features + total_features = ( + n_modality_features # modality one-hot + + n_region_type_features # region_type one-hot + + n_sequence_type_features # sequence_type one-hot + + 2 # min_len, max_len + ) + + # Total features = one-hot encodings + numerical features + position + total_features = ( + n_modality_features # modality 
one-hot + + n_region_type_features # region_type one-hot + + n_sequence_type_features # sequence_type one-hot + + 2 # min_len, max_len + + 1 # position in region list (1-based) + ) + + rows = [] # Will hold our feature vectors + row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples + + for spec_id, modality_regions in specs_regions.items(): + for modality, regions in modality_regions.items(): + # Enumerate regions to get position (1-based) + for position, region in enumerate(regions, start=1): + # Create feature vector for this region + feature_vector = np.zeros(total_features) + current_idx = 0 + + # Add modality one-hot + modality_idx = MODALITIES.index(modality) + feature_vector[modality_idx] = 1 + current_idx += n_modality_features + + # Add region_type one-hot + region_type_idx = REGION_TYPES.index(region["region_type"]) + feature_vector[current_idx + region_type_idx] = 1 + current_idx += n_region_type_features + + # Add sequence_type one-hot + sequence_type_idx = SEQUENCE_TYPES.index(region["sequence_type"]) + feature_vector[current_idx + sequence_type_idx] = 1 + current_idx += n_sequence_type_features + + # Add lengths + feature_vector[current_idx] = region["min_len"] + feature_vector[current_idx + 1] = region["max_len"] + current_idx += 2 + + # Add position + feature_vector[current_idx] = position + + # Store feature vector and identifier + rows.append(feature_vector) + row_identifiers.append((spec_id, modality, region["region_type"])) + + return np.array(rows), row_identifiers + + +def gb_to_seqspec(gb): + ex = gb_to_list(gb) + nested_json = nest_intervals(ex) + filled_regions = fill_gaps(gb.sequence, nested_json) + regions = convert(filled_regions) + reads = [] + spec = Assay( + "genbank", + "illumina", + "genbank thing", + "doi", + "date", + ["source"], + "description", + "", + "", + "", + "", + reads, + regions, + ) + return spec + + +def gb_to_list(gb): + feat = [] + label = "source" + for f in gb.features: + id = f.key + + if "complement" in f.location: + start, stop = tuple(map(int, f.location[11:-1].split(".."))) + else: + start, stop = tuple(map(int, f.location.split(".."))) + + # convert to 0-index + start -= 1 + length = stop - start + seq = gb.sequence[start:stop] + + for q in f.qualifiers: + if q.key == "/label=": + label = q.value + break + feat.append( + { + "id": id, + "label": label, + "start": start, + "stop": stop, + "length": length, + "seq": seq, + } + ) + return feat + + +def nest_intervals(intervals): + def nest(start_index, end_limit): + nested = [] + + i = start_index + while i < len(intervals) and intervals[i]["start"] < end_limit: + current_interval = intervals[i] + child, next_index = nest(i + 1, current_interval["stop"]) + interval_obj = { + "id": current_interval["id"], + "label": current_interval["label"], + "start": current_interval["start"], + "stop": current_interval["stop"], + "length": current_interval["length"], + "seq": current_interval["seq"], + "regions": child, + } + nested.append(interval_obj) + i = next_index + + return nested, i + + result, _ = nest(0, intervals[0]["stop"]) + return result + + +def fill_gaps(seq, regions, parent_start=0, parent_stop=0): + if len(regions) == 0: + return [] + + # Insert a filler at the start if necessary + if regions[0]["start"] > parent_start: + start = parent_start + stop = regions[0]["start"] + s = seq[start:stop] + regions.insert( + 0, + { + "id": "filler_start", + "label": "filler_start", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + }, + 
) + + new_regions = [] + for i, region in enumerate(regions): + # Append the current region + new_regions.append(region) + + # Recursive call for nested regions + if "regions" in region: + region["regions"] = fill_gaps( + seq, region["regions"], region["start"], region["stop"] + ) + + # Check for gap and insert a filler + if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: + filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' + start = region["stop"] + stop = regions[i + 1]["start"] + s = seq[start:stop] + new_regions.append( + { + "id": filler_id, + "label": filler_id, + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + # Insert a filler at the end if necessary + if new_regions[-1]["stop"] < parent_stop: + start = new_regions[-1]["stop"] + stop = parent_stop + s = seq[start:stop] + new_regions.append( + { + "id": "filler_end", + "label": "filler_end", + "start": start, + "stop": stop, + "length": stop - start, + "seq": s, + "regions": [], + } + ) + + return new_regions + + +# convert filled regions to seqspec, must be recursive function +# regions is a list +def convert(regions): + if len(regions) == 0: + return [] + new_regions = [] + for r in regions: + rgn = Region( + r["id"], + "", + r["label"], + "fixed", + r["seq"], + r["length"], + r["length"], + None, + None, + ) + if len(r["regions"]) > 0: + rgn.regions = convert(r["regions"]) + new_regions.append(rgn) + return new_regions diff --git a/seqspec/seqspec_genbank.py b/seqspec/seqspec_genbank.py deleted file mode 100644 index 419d67c..0000000 --- a/seqspec/seqspec_genbank.py +++ /dev/null @@ -1,210 +0,0 @@ -from seqspec.utils import load_genbank -import json -from seqspec.Region import Region -from seqspec.Assay import Assay - - -def setup_genbank_args(parser): - subparser = parser.add_parser( - "genbank", - description="get genbank about seqspec file", - help="get genbank about seqspec file", - ) - - subparser.add_argument("gbk", help="Genbank file") - subparser.add_argument( - "-o", - metavar="OUT", - help=("Path to output file"), - type=str, - default=None, - required=False, - ) - return subparser - - -def validate_genbank_args(parser, args): - # if everything is valid the run_genbank - fn = args.gbk - o = args.o - gb = load_genbank(fn) - - spec = run_genbank(gb) - - if o: - spec.to_YAML(o) - else: - print(json.dumps(spec, sort_keys=False, indent=4)) - - -def run_genbank(gb): - ex = gb_to_list(gb) - nested_json = nest_intervals(ex) - filled_regions = fill_gaps(gb.sequence, nested_json) - regions = convert(filled_regions) - spec = Assay( - "genbank", - "illumina", - "genbank thing", - "doi", - "date", - "description", - ["source"], - "", - regions, - ) - return spec - - -def gb_to_list(gb): - feat = [] - label = "source" - for f in gb.features: - id = f.key - - if "complement" in f.location: - start, stop = tuple(map(int, f.location[11:-1].split(".."))) - else: - start, stop = tuple(map(int, f.location.split(".."))) - - # convert to 0-index - start -= 1 - length = stop - start - seq = gb.sequence[start:stop] - - for q in f.qualifiers: - if q.key == "/label=": - label = q.value - break - feat.append( - { - "id": id, - "label": label, - "start": start, - "stop": stop, - "length": length, - "seq": seq, - } - ) - return feat - - -def nest_intervals(intervals): - def nest(start_index, end_limit): - nested = [] - - i = start_index - while i < len(intervals) and intervals[i]["start"] < end_limit: - current_interval = intervals[i] - child, next_index = nest(i + 1, 
current_interval["stop"]) - interval_obj = { - "id": current_interval["id"], - "label": current_interval["label"], - "start": current_interval["start"], - "stop": current_interval["stop"], - "length": current_interval["length"], - "seq": current_interval["seq"], - "regions": child, - } - nested.append(interval_obj) - i = next_index - - return nested, i - - result, _ = nest(0, intervals[0]["stop"]) - return result - - -def fill_gaps(seq, regions, parent_start=0, parent_stop=0): - if len(regions) == 0: - return [] - - # Insert a filler at the start if necessary - if regions[0]["start"] > parent_start: - start = parent_start - stop = regions[0]["start"] - s = seq[start:stop] - regions.insert( - 0, - { - "id": "filler_start", - "label": "filler_start", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - }, - ) - - new_regions = [] - for i, region in enumerate(regions): - # Append the current region - new_regions.append(region) - - # Recursive call for nested regions - if "regions" in region: - region["regions"] = fill_gaps( - seq, region["regions"], region["start"], region["stop"] - ) - - # Check for gap and insert a filler - if i < len(regions) - 1 and region["stop"] < regions[i + 1]["start"]: - filler_id = f'filler_{region["id"]}_{regions[i+1]["id"]}' - start = region["stop"] - stop = regions[i + 1]["start"] - s = seq[start:stop] - new_regions.append( - { - "id": filler_id, - "label": filler_id, - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - # Insert a filler at the end if necessary - if new_regions[-1]["stop"] < parent_stop: - start = new_regions[-1]["stop"] - stop = parent_stop - s = seq[start:stop] - new_regions.append( - { - "id": "filler_end", - "label": "filler_end", - "start": start, - "stop": stop, - "length": stop - start, - "seq": s, - "regions": [], - } - ) - - return new_regions - - -# convert filled regions to seqspec, must be recursive function -# regions is a list -def convert(regions): - if len(regions) == 0: - return [] - new_regions = [] - for r in regions: - rgn = Region( - r["id"], - "", - r["label"], - "fixed", - r["seq"], - r["length"], - r["length"], - None, - None, - ) - if len(r["regions"]) > 0: - rgn.regions = convert(r["regions"]) - new_regions.append(rgn) - return new_regions From e3a6dea4b3a25742e0335391278b79aa79f7bc7e Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Mon, 25 Nov 2024 15:14:22 -0600 Subject: [PATCH 04/21] allow https for remote onlist (#54) --- seqspec/seqspec_onlist.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index a29c63a..b021a5a 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -136,7 +136,7 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): onlist_path = os.path.join(base_path, onlist_fn) if os.path.exists(onlist_path): urltype = "local" - elif urltype == "http": + elif urltype in ["http", "https"]: # download the onlist to the base path and return the path onlist_elements = read_remote_list(onlists[0]) onlist_path = write_onlist(onlist_elements, save_path) @@ -147,7 +147,7 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): for o in onlists: if o.urltype == "local": lsts.append(read_local_list(o, base_path)) - elif o.urltype == "http": + elif o.urltype in ["http", "https"]: # base_path is ignored for remote onlists lsts.append(read_remote_list(o, base_path)) onlist_elements = 
join_onlists(lsts, fmt) From 8e9554f8293f9efd82bf008eac536e06b743cc12 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 15 Jan 2025 17:49:55 -0600 Subject: [PATCH 05/21] Update seqspec check so we can run it directly in python script (#58) * update seqspec check * add spec parameter back to check function --- seqspec/seqspec_check.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 702dfbe..77e94b4 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -33,19 +33,14 @@ def setup_check_args(parser): def validate_check_args(parser, args): spec_fn = args.yaml o = args.o - schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") - return run_check(schema_fn, spec_fn, o) + return run_check(spec_fn, o) -def run_check(schema_fn, spec_fn, o): +def run_check(spec_fn, o): spec = load_spec(spec_fn) - with open(schema_fn, "r") as stream: - schema = yaml.load(stream, Loader=yaml.Loader) - v = Draft4Validator(schema) - - errors = check(v, spec, spec_fn) + errors = check(spec) if errors: if o: @@ -56,14 +51,19 @@ def run_check(schema_fn, spec_fn, o): return errors -def check(schema: Draft4Validator, spec: Assay, spec_fn: str): +def check(spec: Assay, spec_fn: str = None): + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + + with open(schema_fn, "r") as stream: + schema = yaml.load(stream, Loader=yaml.Loader) + validator = Draft4Validator(schema) errors = [] idx = 0 # with open("del.json", "w") as f: # json.dump(spec.to_dict(), f, indent=4) - for idx, error in enumerate(schema.iter_errors(spec.to_dict()), 1): + for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): errors.append( f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]" ) From a280567bcaf03b392a8718b749f9617efa470420 Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Wed, 15 Jan 2025 15:50:36 -0800 Subject: [PATCH 06/21] added python usage to docs --- docs/SEQSPEC_TOOL.md | 80 ++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 2 deletions(-) diff --git a/docs/SEQSPEC_TOOL.md b/docs/SEQSPEC_TOOL.md index 1e355e7..2703767 100644 --- a/docs/SEQSPEC_TOOL.md +++ b/docs/SEQSPEC_TOOL.md @@ -61,6 +61,12 @@ Check that the `seqspec` file is correctly formatted and consistent with the [sp seqspec check [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_check import run_check + +run_check(schema_fn: str, spec_fn: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - `yaml` corresponds to the `seqspec` file. @@ -133,6 +139,12 @@ $ seqspec check spec.yaml seqspec find [-h] [-o OUT] [-s Selector] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_find import run_find + +run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `-s Selector` is the type of the ID you are searching for (default: region). Can be one of - read @@ -195,6 +207,12 @@ $ seqspec find -m rna -s region-type -i barcode spec.yaml seqspec file [-h] [-o OUT] [-i IDs] -m MODALITY [-s SELECTOR] [-f FORMAT] [-k KEY] yaml ``` +```python +from seqspec.seqspec_file import run_file + +run_file(spec_fn: str, m: str, ids: List[str], idtype: str, fmt: str, k: str, o: str, fp=False) +``` + - optionally, `-o OUT` can be used to write the output to a file. 
- optionally, `-s Selector` is the type of the ID you are searching for (default: read). Can be one of - read @@ -266,6 +284,11 @@ Automatically fill in missing fields in the spec. seqspec format [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_format import run_format +run_format(spec_fn: str, o: str) +``` + - `-o OUT` the path to create the formatted `seqspec` file. - `yaml` corresponds to the `seqspec` file. @@ -283,11 +306,16 @@ $ seqspec format -o spec.yaml spec.yaml Identify the position of elements in a spec for use in downstream tools. Returns the 0-indexed position of elements contained in a given region in the 5'->3' direction. -``` +```bash seqspec index [-o OUT] [-t TOOL] [--rev] -m MODALITY -r REGION yaml seqspec index [-h] [-o OUT] [-t TOOL] [-s SELECTOR] [--rev] -m MODALITY [-i IDs] yaml ``` +```python +from seqspec.seqspec_index import run_index +run_index(spec_fn: str, modality: str, ids: List[str], idtype: str, fmt: str, rev: str, subregion_type: str, o) +``` + - optionally, `-o OUT` can be used to write the output to a file. - optionally, `--rev` can be set to return the 3'->5' index. - optionally, `-t TOOL` returns the indices in the format specified by the tool. One of: @@ -342,6 +370,11 @@ $ seqspec index -m atac -t kb -s file spec.yaml seqspec info [-h] [-k KEY] [-f FORMAT] [-o OUT] yaml ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k=None, o=None) +``` + - optionally, `-o OUT` path to write the info. - optionally, `-k KEY` the object to display (default: meta). Can be one of - modalities @@ -413,6 +446,11 @@ $ seqspec info -f json -k sequence_spec spec.yaml seqspec init [-h] -n NAME -m MODALITIES -r READS [-o OUT] newick ``` +```python +from seqspec.seqspec_info import run_info +run_info(spec_fn: str, f: str, k: str = None, o: str = None) +``` + - optionally, `-o OUT` path to create `seqspec` file. - `-m MODALITIES` is a comma-separated list of modalities (e.g. rna,atac) - `-n NAME` is the name associated with the `seqspec` file. @@ -432,7 +470,7 @@ $ seqspec init -n myassay -m rna -o spec.yaml -r rna,R1.fastq.gz,r1_primer,26,po $ seqspec init -n myassay -m rna,atac -o spec.yaml -r rna,rna_R1.fastq.gz,rna_r1_primer,26,pos:rna,rna_R2.fastq.gz,rna_r2_primer,100,neg:atac,atac_R1.fastq.gz,atac_r1_primer,100,pos:atac,atac_R2.fastq.gz,atac_r1_primer,16,neg:atac,atac_R3.fastq.gz,atac_r2_primer,100,neg "(((rna_r1_primer:0,barcode:16,umi:12,cdna:150,rna_r2_primer:0)rna),(barcode:16,atac_r1_primer:1,gdna:150,atac_r2_primer)atac)" ``` -## `seqsoec methods`: Convert seqspec file into methods section +## `seqspec methods`: Convert seqspec file into methods section Generate a methods section from a seqspec file. @@ -440,6 +478,11 @@ Generate a methods section from a seqspec file. seqspec methods [-h] -m MODALITY [-o OUT] yaml ``` +```python +from seqspec.seqspec_methods import run_methods +run_methods(spec_fn: str, m: str, o: str) +``` + - optionally, `-o OUT` path to write the methods section. - `-m MODALITY` is the modality to write the methods for. - `yaml` corresponds to the `seqspec` file. 
@@ -479,6 +522,13 @@ The library was sequenced on a Illumina NovaSeq 6000 (EFO:0008637) using the Nov seqspec modify [-h] [--read-id READID] [--read-name READNAME] [--primer-id PRIMERID] [--strand STRAND] [--files FILES] [--region-id REGIONID] [--region-type REGIONTYPE] [--region-name REGIONNAME] [--sequence-type SEQUENCETYPE] [--sequence SEQUENCE] [--min-len MINLEN] [--max-len MAXLEN] [-o OUT] [-i IDs] [-s SELECTOR] -m MODALITY yaml ``` +```python +from seqspec.seqspec_modify import run_modify_read, run_modify_region + +run_modify_read(spec, modality, target_read, read_id, read_name, primer_id, min_len, max_len, strand, files) +run_modify_region(spec, modality, target_region, region_id, region_type, name, sequence_type, sequence, min_len, max_len) +``` + Read modifications - optionally, `--read-id READID` specifies the new `read_id`. @@ -529,6 +579,12 @@ $ seqspec modify -m atac -o mod_spec.yaml -i atac_R1 --files "R1_1.fastq.gz,fast seqspec onlist [-h] [-o OUT] [-s SELECTOR] [-f FORMAT] [-i IDs] -m MODALITY yaml ``` +```python +from seqspec.seqspec_onlist import run_onlist + +run_onlist(spec_fn, modality, ids, idtype, fmt, o) +``` + - optionally, `-o OUT` to set the path of the onlist file. - `-m MODALITY` is the modality in which you are searching for the region. - `-i ID` is the `id` of the object to search for the onlist. @@ -563,6 +619,11 @@ Print sequence and/or library structure as ascii, png, or html. seqspec print [-h] [-o OUT] [-f FORMAT] yaml ``` +```python +from seqspec.seqspec_print import run_seqspec_print +run_seqspec_print(spec_fn, fmt, o) +``` + - optionally, `-o OUT` to set the path of printed file. - optionally, `-f FORMAT` is the format of the printed file. Can be one of: - `library-ascii`: prints an ascii tree of the library_spec @@ -651,6 +712,11 @@ $ seqspec print -o spec.png -f seqspec-png spec.yaml seqspec split [-h] -o OUT yaml ``` +```python +from seqspec.seqspec_split import run_split +run_split(spec_fn, o) +``` + - optionally, `-o OUT` name prepended to split specs. - `yaml` corresponds to the `seqspec` file. @@ -673,6 +739,11 @@ split.tag.yaml seqspec version [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_version import run_version +run_version(spec_fn, o) +``` + - optionally, `-o OUT` path to file to write output. - `yaml` corresponds to the `seqspec` file. 
@@ -693,6 +764,11 @@ This is a hidden subcommand that upgrades an old version of the spec to the curr seqspec upgrade [-h] [-o OUT] yaml ``` +```python +from seqspec.seqspec_upgrade import run_upgrade +run_upgrade(spec_fn, o) +``` + ### Examples ```bash From c9520b49232ec9a488a32ac9aba4099878241fad Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 22 Jan 2025 16:08:47 -0600 Subject: [PATCH 07/21] support gzipped yaml file for function load_spec (#60) * support gzipped yaml file for function load_spec * fix bug in function run_check * support gzipped yaml file for function load_spec --- seqspec/seqspec_check.py | 4 ++-- seqspec/utils.py | 16 ++++++++++++++-- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 77e94b4..402a99a 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -40,7 +40,7 @@ def validate_check_args(parser, args): def run_check(spec_fn, o): spec = load_spec(spec_fn) - errors = check(spec) + errors = check(spec, spec_fn) if errors: if o: @@ -51,7 +51,7 @@ def run_check(spec_fn, o): return errors -def check(spec: Assay, spec_fn: str = None): +def check(spec: Assay, spec_fn: str): schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as stream: diff --git a/seqspec/utils.py b/seqspec/utils.py index 1ed0b25..eea15dc 100644 --- a/seqspec/utils.py +++ b/seqspec/utils.py @@ -13,8 +13,20 @@ def load_spec(spec_fn: str): - with open(spec_fn, "r") as stream: - return load_spec_stream(stream) + """ + Reads a YAML file that may be gzipped or not. + + :param spec_fn: Path to the YAML or gzipped YAML file. + :return: Parsed YAML content as a Assay object. + """ + try: + # Check if the file is gzipped by attempting to open it as such + with gzip.open(spec_fn, "rt") as stream: + return load_spec_stream(stream) + except gzip.BadGzipFile: + # If opening as gzip fails, assume it's a regular YAML file + with open(spec_fn, "r") as stream: + return load_spec_stream(stream) def load_spec_stream(spec_stream: io.IOBase): From 1ea72390c4faf979095da6feb8d4e697a5250a91 Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Thu, 20 Feb 2025 14:01:35 -0800 Subject: [PATCH 08/21] enabled skipping checks with seqspec check --- seqspec/seqspec_check.py | 662 +++++++++++++++++++++++++++------------ 1 file changed, 459 insertions(+), 203 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 402a99a..f45afb9 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -19,7 +19,6 @@ def setup_check_args(parser): help="Validate seqspec file against specification", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") subparser.add_argument( "-o", metavar="OUT", @@ -27,174 +26,342 @@ def setup_check_args(parser): type=str, default=None, ) + subparser.add_argument( + "-s", + metavar="SKIP", + help=("Skip checks"), + type=str, + default=None, + choices=["igvf"], + ) + + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) + return subparser def validate_check_args(parser, args): spec_fn = args.yaml o = args.o + s = args.s - return run_check(spec_fn, o) + return run_check(spec_fn, o, s) -def run_check(spec_fn, o): +def run_check(spec_fn, o, s): spec = load_spec(spec_fn) errors = check(spec, spec_fn) + if s == "igvf": + errors = filter_errors(errors, "igvf") if errors: if o: with open(o, "w") as f: - 
print("\n".join(errors), file=f) + for idx, e in enumerate(errors, 1): + print(format_error(e, idx), file=f) else: - print("\n".join(errors)) + for idx, e in enumerate(errors, 1): + print(format_error(e, idx)) return errors -def check(spec: Assay, spec_fn: str): - schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") +IGVF_FILTERS = [ + {"error_type": "check_schema", "error_object": "lib_struct"}, + {"error_type": "check_schema", "error_object": "library_protocol"}, + {"error_type": "check_schema", "error_object": "library_kit"}, + {"error_type": "check_schema", "error_object": "sequence_protocol"}, + {"error_type": "check_schema", "error_object": "sequence_kit"}, + {"error_type": "check_schema", "error_object": "md5"}, +] - with open(schema_fn, "r") as stream: - schema = yaml.load(stream, Loader=yaml.Loader) - validator = Draft4Validator(schema) - errors = [] - idx = 0 - # with open("del.json", "w") as f: - # json.dump(spec.to_dict(), f, indent=4) - - for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): - errors.append( - f"[error {idx}] {error.message} in spec[{']['.join(repr(index) for index in error.path)}]" - ) - idx += 1 - # check that the modalities are unique - if len(spec.modalities) != len(set(spec.modalities)): - errors.append( - f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" - ) +def filter_errors(errors, filter_type): + if filter_type == "igvf": + et = set([i["error_type"] for i in IGVF_FILTERS]) + eo = set([i["error_object"] for i in IGVF_FILTERS]) + ferrors = [] + for i in errors: + if i["error_type"] not in et and i["error_object"] not in eo: + ferrors.append(i) + return ferrors + else: + return errors + + +def format_error(errobj, idx=0): + return f"[error {idx}] {errobj['error_message']}" + + +def check(spec: Assay, spec_fn: str): + # Variety of checks against schema + def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") + with open(schema_fn, "r") as stream: + schema = yaml.load(stream, Loader=yaml.Loader) + validator = Draft4Validator(schema) + for idx, error in enumerate(validator.iter_errors(spec.to_dict()), 1): + err_elements = [repr(index) for index in error.path] + err_path = f"spec[{']['.join(err_elements)}]" + errobj = { + "error_type": "check_schema", + "error_message": f"{error.message} in {err_path}", + "error_object": err_elements[-1], + } + # errors.append(f"[error {idx}] {error.message} in {err_path}]") + errors.append(errobj) idx += 1 + return (errors, idx) - # check that region_ids of the first level of the spec correspond to the modalities - # one for each modality - modes = spec.modalities - rgns = spec.library_spec - for r in rgns: - rid = r.region_id - if rid not in modes: - errors.append( - f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" - ) + # Modalities are unique + def check_unique_modalities(spec, spec_fn, errors, idx): + if len(spec.modalities) != len(set(spec.modalities)): + errobj = { + "error_type": "check_unique_modalities", + "error_message": f"modalities [{', '.join(spec.modalities)}] are not unique", + "error_object": "modalities", + } + # errors.append( + # f"[error {idx}] modalities [{', '.join(spec.modalities)}] are not unique" + # ) + errors.append(errobj) idx += 1 + return (errors, idx) - # get all of the onlist files in the spec and check that they exist relative to the path of the spec - modes = 
spec.modalities - olrgns = [] - for m in modes: - olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] - - # check paths relative to spec_fn - for ol in olrgns: - if ol.urltype == "local": - if ol.filename[:-3] == ".gz": - check = path.join(path.dirname(spec_fn), ol.filename[:-3]) - if not path.exists(check): - errors.append(f"[error {idx}] {ol.filename[:-3]} does not exist") - idx += 1 - else: - check = path.join(path.dirname(spec_fn), ol.filename) - check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") - if not path.exists(check) and not path.exists(check_gz): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": - # ping the link with a simple http request to check if the file exists at that URI - if spec.seqspec_version == "0.3.0": - if not file_exists(ol.url): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 - else: - if not file_exists(ol.filename): - errors.append(f"[error {idx}] {ol.filename} does not exist") - idx += 1 + # Region_ids of the first level correspond to the modalities (one per modality) + def check_region_ids_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + rgns = spec.library_spec + for r in rgns: + rid = r.region_id + if rid not in modes: + errobj = { + "error_type": "check_region_ids_modalities", + "error_message": f"region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rid}' of the first level of the spec does not correspond to a modality [{', '.join(modes)}]" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - # read ids should be unique - read_ids = set() - for read in spec.sequence_spec: - if read.read_id in read_ids: - errors.append( - f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" - ) - idx += 1 - else: - read_ids.add(read.read_id) - - # iterate through reads in sequence_spec and check that the fastq files exist - for read in spec.sequence_spec: - spec_fn = path.dirname(spec_fn) - for f in read.files: - if f.urltype == "local": - check = path.join(spec_fn, f.filename) - if not path.exists(check): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 - elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": + # Onlist files exist relative to the path of the spec or http + def check_onlist_files_exist(spec, spec_fn, errors, idx): + modes = spec.modalities + olrgns = [] + for m in modes: + olrgns += [i.onlist for i in spec.get_libspec(m).get_onlist_regions()] + + # check paths relative to spec_fn + for ol in olrgns: + if ol.urltype == "local": + if ol.filename[:-3] == ".gz": + check = path.join(path.dirname(spec_fn), ol.filename[:-3]) + if not path.exists(check): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename[:-3]} does not exist", + "error_object": "onlist", + } + # errors.append( + # f"[error {idx}] {ol.filename[:-3]} does not exist" + # ) + errors.append(errobj) + idx += 1 + else: + check = path.join(path.dirname(spec_fn), ol.filename) + check_gz = path.join(path.dirname(spec_fn), ol.filename + ".gz") + if not path.exists(check) and not path.exists(check_gz): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] 
{ol.filename} does not exist") + errors.append(errobj) + idx += 1 + elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI - if not file_exists(f.url): - errors.append(f"[error {idx}] {f.filename} does not exist") - idx += 1 + if spec.seqspec_version == "0.3.0": + if not file_exists(ol.url): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + else: + if not file_exists(ol.filename): + errobj = { + "error_type": "check_onlist_files_exist", + "error_message": f"{ol.filename} does not exist", + "error_object": "onlist", + } + # errors.append(f"[error {idx}] {ol.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the primer ids, strand tuple pairs are unique across all reads - primer_strand_pairs = set() - for read in spec.sequence_spec: - if (read.primer_id, read.strand) in primer_strand_pairs: - errors.append( - f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" - ) - idx += 1 - else: - primer_strand_pairs.add((read.primer_id, read.strand)) + # Read ids are unique + def check_unique_read_ids(spec, spec_fn, errors, idx): + read_ids = set() + for read in spec.sequence_spec: + if read.read_id in read_ids: + errobj = { + "error_type": "check_unique_read_ids", + "error_message": f"read_id '{read.read_id}' is not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] read_id '{read.read_id}' is not unique across all reads" + # ) + errors.append(errobj) + idx += 1 + else: + read_ids.add(read.read_id) + return (errors, idx) - # TODO add option to check md5sum + # Read files exist + def check_read_files_exist(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + spec_fn = path.dirname(spec_fn) + for f in read.files: + if f.urltype == "local": + check = path.join(spec_fn, f.filename) + if not path.exists(check): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + elif f.urltype == "http" or f.urltype == "https" or f.urltype == "ftp": + # ping the link with a simple http request to check if the file exists at that URI + if not file_exists(f.url): + errobj = { + "error_type": "check_read_files_exist", + "error_message": f"{f.filename} does not exist", + "error_object": "file", + } + # errors.append(f"[error {idx}] {f.filename} does not exist") + errors.append(errobj) + idx += 1 + return (errors, idx) - # check that the region_id is unique across all regions - rgn_ids = set() - for m in modes: - for rgn in spec.get_libspec(m).get_leaves(): - if rgn.region_id in rgn_ids: - errors.append( - f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" - ) + # Primer ids, strand tuple pairs are unique across all reads + def check_unique_read_primer_strand_pairs(spec, spec_fn, errors, idx): + primer_strand_pairs = set() + for read in spec.sequence_spec: + if (read.primer_id, read.strand) in primer_strand_pairs: + errobj = { + "error_type": "check_unique_read_primer_strand_pairs", + "error_message": f"primer_id '{read.primer_id}' and strand '{read.strand}' tuple is 
not unique across all reads", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] primer_id '{read.primer_id}' and strand '{read.strand}' tuple is not unique across all reads" + # ) + errors.append(errobj) idx += 1 else: - rgn_ids.add(rgn.region_id) - - # check that the modality is in the reads - for read in spec.sequence_spec: - if read.modality not in modes: - errors.append( - f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" - ) - idx += 1 + primer_strand_pairs.add((read.primer_id, read.strand)) + return (errors, idx) + + # TODO add option to check md5sum + def check_md5sum(spec, spec_fn, errors, idx): + return (errors, idx) + + # Region_id is unique across all regions + def check_unique_region_ids(spec, spec_fn, errors, idx): + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + errobj = { + "error_type": "check_unique_region_ids", + "error_message": f"region_id '{rgn.region_id}' is not unique across all regions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] region_id '{rgn.region_id}' is not unique across all regions" + # ) + errors.append(errobj) + idx += 1 + else: + rgn_ids.add(rgn.region_id) + return (errors, idx) + + # Modality is in the reads + def check_read_modalities(spec, spec_fn, errors, idx): + modes = spec.modalities + for read in spec.sequence_spec: + if read.modality not in modes: + errobj = { + "error_type": "check_read_modalities", + "error_message": f"read '{read.read_id}' modality '{read.modality}' does not exist in the modalities", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' modality '{read.modality}' does not exist in the modalities" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the unique primer ids exist as a region id in the library_spec - for read in spec.sequence_spec: - if read.primer_id not in rgn_ids: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" - ) - idx += 1 + # TODO is there a better way to get the rgn_ids? 
+ def check_primer_ids_in_region_ids(spec, spec_fn, errors, idx): + # first get all unique region_ids + modes = spec.modalities + rgn_ids = set() + for m in modes: + for rgn in spec.get_libspec(m).get_leaves(): + if rgn.region_id in rgn_ids: + pass + else: + rgn_ids.add(rgn.region_id) + + # then check that the primer ids exist in the region_ids + for read in spec.sequence_spec: + if read.primer_id not in rgn_ids: + errobj = { + "error_type": "check_primer_ids_in_region_ids", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist in the library_spec" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # NOTE: this is a strong assumption that may be relaxed in the future # check that the primer id for each read is in the leaves of the spec for that modality - for read in spec.sequence_spec: - mode = spec.get_libspec(read.modality) - leaves = mode.get_leaves() - if read.primer_id not in [i.region_id for i in leaves]: - errors.append( - f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" - ) - idx += 1 + def check_primer_ids_in_libspec_leaves(spec, spec_fn, errors, idx): + for read in spec.sequence_spec: + mode = spec.get_libspec(read.modality) + leaves = mode.get_leaves() + if read.primer_id not in [i.region_id for i in leaves]: + errobj = { + "error_type": "check_primer_ids_in_libspec_leaves", + "error_message": f"'{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'", + "error_object": "read", + } + # errors.append( + # f"[error {idx}] '{read.read_id}' primer_id '{read.primer_id}' does not exist as an atomic region in the library_spec for modality '{read.modality}'" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) # check that the max read len is not longer than the max len of the lib spec after the primer # for read in spec.sequence_spec: @@ -202,89 +369,178 @@ def check(spec: Assay, spec_fn: str): # leaves = mode.get_leaves() # idx = [i.region_id for i in leaves].index(read.primer_id) - # if a region has a sequence type "fixed" then it should not contain subregions - # if a region has a sequence type "joiend" then it should contain subregions - # if a region has a sequence type "random" then it should not contain subregions and should be all X's - # if a region has a sequence type "onlist" then it should have an onlist object - def seqtype_check(rgn, errors, idx): - # this is a recursive function that iterates through all regions and checks the sequence type - if rgn.sequence_type == "fixed" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "joined" and not rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.regions: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" - ) - idx += 1 - if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" - ) - idx += 1 - if rgn.sequence_type == "onlist" and not 
rgn.onlist: - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" - ) - idx += 1 - if rgn.regions: - for r in rgn.regions: - errors, idx = seqtype_check(r, errors, idx) - return (errors, idx) + def check_sequence_types(spec, spec_fn, errors, idx): + modes = spec.modalities + + # if a region has a sequence type "fixed" then it should not contain subregions + # if a region has a sequence type "joiend" then it should contain subregions + # if a region has a sequence type "random" then it should not contain subregions and should be all X's + # if a region has a sequence type "onlist" then it should have an onlist object + def seqtype_check(rgn, errors, idx): + # this is a recursive function that iterates through all regions and checks the sequence type + if rgn.sequence_type == "fixed" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'fixed' and contains subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'fixed' and contains subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "joined" and not rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'joined' and does not contain subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'joined' and does not contain subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and rgn.regions: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" + # ) + errors.append(errobj) + idx += 1 + if rgn.sequence_type == "onlist" and not rgn.onlist: + errobj = { + "error_type": "check_sequence_types", + "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" + # ) + errors.append(errobj) + idx += 1 + if rgn.regions: + for r in rgn.regions: + errors, idx = seqtype_check(r, errors, idx) + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seqtype_check(rgn, errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seqtype_check(rgn, errors, idx) + return (errors, idx) # check the lengths of every region against the max_len, using a recursive function - def len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = len_check(r, errors, idx) - if rgn.max_len < rgn.min_len: - errors.append( - f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" - ) - idx += 1 + def check_region_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def len_check(rgn, 
errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = len_check(r, errors, idx) + if rgn.max_len < rgn.min_len: + errobj = { + "error_type": "check_region_lengths", + "error_message": f"'{rgn.region_id}' max_len is less than min_len", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' max_len is less than min_len" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) + + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = len_check(rgn, errors, idx) return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = len_check(rgn, errors, idx) + # errors, idx = check_region_lengths(spec, spec_fn, errors, idx) # check that the length of the sequence is equal to the max_len using a recursive function # an assumption in the code and spec is that the displayed sequence is equal to the max_len - def seq_len_check(rgn, errors, idx): - if rgn.regions: - for r in rgn.regions: - errors, idx = seq_len_check(r, errors, idx) - if rgn.sequence and ( - len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len - ): - # noqa - errors.append( - f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" - ) - idx += 1 - return (errors, idx) + def check_sequence_lengths(spec, spec_fn, errors, idx): + modes = spec.modalities + + def seq_len_check(rgn, errors, idx): + if rgn.regions: + for r in rgn.regions: + errors, idx = seq_len_check(r, errors, idx) + if rgn.sequence and ( + len(rgn.sequence) < rgn.min_len or len(rgn.sequence) > rgn.max_len + ): + # noqa + errobj = { + "error_type": "check_sequence_lengths", + "error_message": f"'{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})", + "error_object": "region", + } + # errors.append( + # f"[error {idx}] '{rgn.region_id}' sequence '{rgn.sequence}' has length {len(rgn.sequence)}, expected range ({rgn.min_len}, {rgn.max_len})" + # ) + errors.append(errobj) + idx += 1 + return (errors, idx) - for m in modes: - for rgn in [spec.get_libspec(m)]: - errors, idx = seq_len_check(rgn, errors, idx) + for m in modes: + for rgn in [spec.get_libspec(m)]: + errors, idx = seq_len_check(rgn, errors, idx) + return (errors, idx) # check that the number of files in each "File" object for all Read object are all the same length - nfiles = [] - for read in spec.sequence_spec: - nfiles.append(len(read.files)) + def check_read_file_count(spec, spec_fn, errors, idx): + nfiles = [] + for read in spec.sequence_spec: + nfiles.append(len(read.files)) + + if len(set(nfiles)) != 1: + errobj = { + "error_type": "check_read_file_count", + "error_message": "Reads must have the same number of files", + "error_object": "read", + } + # errors.append(f"[error {idx}] Reads must have the same number of files") + errors.append(errobj) + idx += 1 + return (errors, idx) - if len(set(nfiles)) != 1: - errors.append(f"[error {idx}] Reads must have the same number of files") - idx += 1 + # errors, idx = check_read_file_count(spec, spec_fn, errors, idx) + + errors = [] + idx = 0 + checks = { + "check_schema": check_schema, + "check_unique_modalities": check_unique_modalities, + "check_region_ids_modalities": check_region_ids_modalities, + "check_onlist_files_exist": check_onlist_files_exist, + "check_unique_read_ids": check_unique_read_ids, + "check_read_files_exist": check_read_files_exist, + "check_unique_read_primer_strand_pairs": 
check_unique_read_primer_strand_pairs, + "check_unique_region_ids": check_unique_region_ids, + "check_read_modalities": check_read_modalities, + "check_primer_ids_in_region_ids": check_primer_ids_in_region_ids, + "check_primer_ids_in_libspec_leaves": check_primer_ids_in_libspec_leaves, + "check_sequence_types": check_sequence_types, + "check_region_lengths": check_region_lengths, + "check_sequence_lengths": check_sequence_lengths, + "check_read_file_count": check_read_file_count, + } + for k, v in checks.items(): + errors, idx = v(spec, spec_fn, errors, idx) return errors From 1e1abedf30102fc2aa0c88855e096df04ef62a8e Mon Sep 17 00:00:00 2001 From: Sina Booeshaghi Date: Fri, 21 Feb 2025 14:41:51 -0800 Subject: [PATCH 09/21] updating seqspec-html to print read info --- seqspec/seqspec_print.py | 33 ++++++++------ seqspec/seqspec_print_html.py | 84 +++++++++++++++++++++++++++++++---- 2 files changed, 95 insertions(+), 22 deletions(-) diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index be7f304..1324ea8 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -71,7 +71,7 @@ def run_seqspec_print(spec_fn, fmt, o): s = CMD[fmt](spec) if fmt == "png": - return s.savefig(o, dpi=300, bbox_inches="tight") + return s.savefig(o, dpi=300, bbox_inches="tight") # if o: with open(o, "w") as f: @@ -84,10 +84,26 @@ def run_seqspec_print(spec_fn, fmt, o): def print_seqspec_ascii(spec): p = [] for modality in spec.modalities: - p.append(libseq(spec, modality)) + p.append(format_libseq(spec, modality, *libseq(spec, modality))) return "\n".join(p) +def format_libseq(spec, modality, p, n): + libspec = spec.get_libspec(modality) + + s = "\n".join( + [ + modality, + "---", + "\n".join(p), + libspec.sequence, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) + return s + + def libseq(spec, modality): libspec = spec.get_libspec(modality) seqspec = spec.get_seqspec(modality) @@ -118,18 +134,7 @@ def libseq(spec, modality): arrow = arrowl * "-" n.append(f"{ws}<{arrow}|({idx}) {read_id}") - - s = "\n".join( - [ - modality, - "---", - "\n".join(p), - libspec.sequence, - complement_sequence(libspec.sequence), - "\n".join(n), - ] - ) - return s + return (p, n) def run_print(data): diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index f469238..69b904d 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,4 +1,7 @@ +from seqspec.Assay import Assay from seqspec.Region import Region +from seqspec.Read import Read +from seqspec.Read import File def print_seqspec_html(spec): @@ -94,6 +97,7 @@ def atomicRegionTemplate(
  • onlist: {onlist}
  • regions: {subseq}
  • + """ return s @@ -116,21 +120,85 @@ def regionsTemplate(regions): return s -def libStructTemplate(region): +def libStructTemplate(spec, modality): + from seqspec.seqspec_print import libseq + from seqspec.Region import complement_sequence + + libspec = spec.get_libspec(modality) + seqspec = spec.get_seqspec(modality) # noqa + p, n = libseq(spec, modality) + + cseq = colorSeq(libspec.get_leaves()) + seq = "\n".join( + [ + "\n".join(p), + cseq, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) s = f""" -
    {region.name}
    +
    {modality}
    -{colorSeq(region.get_leaves())}
    +{seq} """ return s -def multiModalTemplate(library_spec): - s = "".join( - [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec] - ) +def atomicReadTemplate(read: Read): + files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else "" + + s = f""" +
    + {read.name} +
      +
    • read_id: {read.read_id}
    • +
    • primer_id: {read.primer_id}
    • +
    • min_len: {read.min_len}
    • +
    • max_len: {read.max_len}
    • +
    • strand: {read.strand}
    • +
    • + files: +
        + {files} +
      +
    • +
    +
    + """ + return s + + +def atomicFileTemplate(file: File): + s = f""" +
  • {file.filename} (md5: {file.md5})
  • + """ + return s + + +def readsTemplate(reads): + s = f"""
    1. + {'
    2. '.join([atomicReadTemplate(r) for r in reads])} +
    """ + return s + + +def multiModalTemplate(spec: Assay): + modes = spec.modalities + s = "" + for m in modes: + libspec = spec.get_libspec(m) + seqspec = spec.get_seqspec(m) + + s += f""" + {libStructTemplate(spec, m)} +

    Sequence structure

+    {readsTemplate(seqspec)}
+

    Library structure

    + {regionsTemplate(libspec.get_leaves())} + """ return s @@ -173,7 +241,7 @@ def htmlTemplate(spec):

    Final library

-    {multiModalTemplate(spec.library_spec)}
+    {multiModalTemplate(spec)}
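A minimal usage sketch of the refactored print helpers in this patch (illustrative only, not part of the diff): libseq() now returns just the read-annotation lines above (p) and below (n) the library sequence, and format_libseq() assembles the final ASCII diagram, which is what lets seqspec_print_html.py reuse the same p/n lines. The function names and signatures come from the hunks above; the spec path "spec.yaml" is an assumption.

    from seqspec.utils import load_spec
    from seqspec.seqspec_print import libseq, format_libseq

    spec = load_spec("spec.yaml")
    for modality in spec.modalities:
        p, n = libseq(spec, modality)               # read arrows above (p) and below (n) the sequence
        print(format_libseq(spec, modality, p, n))  # modality header, sequence, complement, read arrows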
    From 1b2f3456a0fc88655f692ff48f47d1e706e61379 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 20 Mar 2025 08:49:59 -0500 Subject: [PATCH 10/21] CHECK-161-onlist (#3) --- seqspec/seqspec_check.py | 47 +++++++++++++++++++++++++++++----------- 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index f45afb9..766d4fd 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -32,7 +32,7 @@ def setup_check_args(parser): help=("Skip checks"), type=str, default=None, - choices=["igvf"], + choices=["igvf", "igvf_onlist_skip"], ) subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) @@ -54,6 +54,8 @@ def run_check(spec_fn, o, s): errors = check(spec, spec_fn) if s == "igvf": errors = filter_errors(errors, "igvf") + elif s == "igvf_onlist_skip": + errors = filter_errors(errors, "igvf_onlist_skip") if errors: if o: @@ -66,24 +68,43 @@ def run_check(spec_fn, o, s): return errors -IGVF_FILTERS = [ - {"error_type": "check_schema", "error_object": "lib_struct"}, - {"error_type": "check_schema", "error_object": "library_protocol"}, - {"error_type": "check_schema", "error_object": "library_kit"}, - {"error_type": "check_schema", "error_object": "sequence_protocol"}, - {"error_type": "check_schema", "error_object": "sequence_kit"}, - {"error_type": "check_schema", "error_object": "md5"}, -] +IGVF_FILTERS = { + "check_schema": [ + "'lib_struct'", + "'library_protocol'", + "'library_kit'", + "'sequence_protocol'", + "'sequence_kit'", + "'md5'", + ], +} +IGVF_ONLIST_SKIP_FILTERS = { + "check_schema": [ + "'lib_struct'", + "'library_protocol'", + "'library_kit'", + "'sequence_protocol'", + "'sequence_kit'", + "'md5'", + ], + "check_onlist_files_exist": ["onlist"], +} def filter_errors(errors, filter_type): + filters = None if filter_type == "igvf": - et = set([i["error_type"] for i in IGVF_FILTERS]) - eo = set([i["error_object"] for i in IGVF_FILTERS]) + filters = IGVF_FILTERS + elif filter_type == "igvf_onlist_skip": + filters = IGVF_ONLIST_SKIP_FILTERS + if filters: ferrors = [] for i in errors: - if i["error_type"] not in et and i["error_object"] not in eo: - ferrors.append(i) + error_type = i["error_type"] + error_object = i["error_object"] + if error_type in filters and error_object in filters[error_type]: + continue + ferrors.append(i) return ferrors else: return errors From 676c0200edaca7e3cce6b3c15094fd5fb8ee899a Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 28 Mar 2025 14:03:08 -0500 Subject: [PATCH 11/21] Merge devel to dev (#7) --- seqspec/File.py | 24 ++++++----------- seqspec/Read.py | 17 +++--------- seqspec/seqspec_check.py | 51 ++++++++++++++++-------------------- seqspec/seqspec_print.py | 7 ++--- setup.cfg | 2 +- tests/test_region.py | 37 +++++++++++++++++++++----- tests/test_seqspec_check.py | 2 +- tests/test_seqspec_onlist.py | 35 +++++++++++++------------ tests/test_seqspec_print.py | 8 +++--- tests/test_utils.py | 50 +++++++++++++++++++++++++++-------- 10 files changed, 132 insertions(+), 101 deletions(-) diff --git a/seqspec/File.py b/seqspec/File.py index 70a52f1..423dced 100644 --- a/seqspec/File.py +++ b/seqspec/File.py @@ -24,26 +24,18 @@ def __init__( self.md5 = md5 def __repr__(self) -> str: - d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": 
self.md5, - } + d = self.to_dict() return f"{d}" def to_dict(self): d = { - "file_id": self.file_id, - "filename": self.filename, - "filetype": self.filetype, - "filesize": self.filesize, - "url": self.url, - "urltype": self.urltype, - "md5": self.md5, + "file_id": getattr(self, "file_id", None), + "filename": getattr(self, "filename", None), + "filetype": getattr(self, "filetype", None), + "filesize": getattr(self, "filesize", None), + "url": getattr(self, "url", None), + "urltype": getattr(self, "urltype", None), + "md5": getattr(self, "md5", None), } return d diff --git a/seqspec/Read.py b/seqspec/Read.py index 6681db3..6984998 100644 --- a/seqspec/Read.py +++ b/seqspec/Read.py @@ -32,24 +32,13 @@ def set_files(self, files: Optional[List["File"]] = []): self.files = files def __repr__(self) -> str: - d = { - "read_id": self.read_id, - "name": self.name, - "modality": self.modality, - "primer_id": self.primer_id, - "min_len": self.min_len, - "max_len": self.max_len, - "strand": self.strand, - "files": self.files, - } + d = self.to_dict() return f"{d}" def to_dict(self): # TODO is this necessary for backwards compatibility? - if self.files: - files = [i.to_dict() for i in self.files] - else: - files = [] + files = getattr(self, "files", []) + files = [i.to_dict() for i in files] d = { "read_id": self.read_id, "name": self.name, diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 766d4fd..1e62276 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -68,28 +68,17 @@ def run_check(spec_fn, o, s): return errors -IGVF_FILTERS = { - "check_schema": [ - "'lib_struct'", - "'library_protocol'", - "'library_kit'", - "'sequence_protocol'", - "'sequence_kit'", - "'md5'", - ], -} -IGVF_ONLIST_SKIP_FILTERS = { - "check_schema": [ - "'lib_struct'", - "'library_protocol'", - "'library_kit'", - "'sequence_protocol'", - "'sequence_kit'", - "'md5'", - ], - "check_onlist_files_exist": ["onlist"], -} - +IGVF_FILTERS = [ + {"error_type": "check_schema", "error_object": "'lib_struct'"}, + {"error_type": "check_schema", "error_object": "'library_protocol'"}, + {"error_type": "check_schema", "error_object": "'library_kit'"}, + {"error_type": "check_schema", "error_object": "'sequence_protocol'"}, + {"error_type": "check_schema", "error_object": "'sequence_kit'"}, + {"error_type": "check_schema", "error_object": "'md5'"}, +] +IGVF_ONLIST_SKIP_FILTERS = IGVF_FILTERS + [ + {"error_type": "check_onlist_files_exist", "error_object": "onlist"} +] def filter_errors(errors, filter_type): filters = None @@ -97,14 +86,20 @@ def filter_errors(errors, filter_type): filters = IGVF_FILTERS elif filter_type == "igvf_onlist_skip": filters = IGVF_ONLIST_SKIP_FILTERS + if filters: ferrors = [] - for i in errors: - error_type = i["error_type"] - error_object = i["error_object"] - if error_type in filters and error_object in filters[error_type]: - continue - ferrors.append(i) + for error in errors: + # Check if this specific error combination exists in the filters + should_filter = any( + error["error_type"] == filter_item["error_type"] + and error["error_object"] == filter_item["error_object"] + for filter_item in filters + ) + + # Only keep errors that don't match our filter criteria + if not should_filter: + ferrors.append(error) return ferrors else: return errors diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 1324ea8..11322f6 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -201,9 +201,10 @@ def plot_png(assay, modalities, modes, nmodes, 
lengths): plt.rcParams.update({"font.size": fsize}) fig, ax = plt.subplots( - figsize=(10, 1 * nmodes), nrows=nmodes, constrained_layout=True + figsize=(10, 1 * nmodes), nrows=nmodes ) - fig.suptitle(assay) + title_offset = 0.98 if nmodes > 1 else 1.2 + fig.suptitle(assay, y=title_offset) rts = [] for m, ax in zip(modes, fig.get_axes()): # get leaves @@ -245,7 +246,7 @@ def plot_png(assay, modalities, modes, nmodes, lengths): ax.autoscale() # since all axes use the same scale, set the xlim to be 0 to the max length - ax.set(**{"xlim": (0, max(lengths))}) + ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)}) # hide the spines for spine in ["right", "top", "left", "bottom"]: diff --git a/setup.cfg b/setup.cfg index ec77cff..435b9be 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,7 +16,7 @@ max-line-length = 88 extend-ignore = E203,E501 [tox:tox] -env_list = py{37,38,39,310,311} +env_list = py{311,312,313} skip_missing_interpreters = True [testenv] diff --git a/tests/test_region.py b/tests/test_region.py index 2e8127c..710665e 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -52,21 +52,37 @@ def read_rna_dict(read_id, min_len=0, max_len=100): "min_len": min_len, "max_len": max_len, "strand": "pos", + "files": [], } return expected class TestOnlist(TestCase): def test_simple_onlist(self): - name = "barcodes.txt" + file_id = "123" + filename = "barcodes.tsv" + filetype = "tsv" + filesize = 300 + url = filename + urltype = "file" md5sum = "d41d8cd98f00b204e9800998ecf8427e" location = "local" - permit = Onlist(name, md5sum, location) + permit = Onlist( + file_id, filename, filetype, filesize, url, "file", md5sum, location + ) self.assertEqual( permit.to_dict(), - {"filename": name, "md5": md5sum, "location": location}, + { + "file_id": file_id, + "filename": filename, + "filetype": filetype, + "filesize": filesize, + "url": url, + "urltype": urltype, + "md5": md5sum, + }, ) @@ -181,11 +197,16 @@ def test_onlists(self): sequence_type = "stuff" sequence = "AACGTGAT" - list_name = "barcodes.txt" + list_id = "123" + list_name = "barcodes.tsv" + list_type = "tsv" + list_size = 300 + list_url = list_name + list_urltype = "file" list_md5sum = "d41d8cd98f00b204e9800998ecf8427e" list_location = "local" - permited = Onlist(list_name, list_md5sum, list_location) + permited = Onlist(list_id, list_name, list_type, list_size, list_url, list_urltype, list_md5sum, list_location) r = Region( region_name, @@ -202,8 +223,12 @@ def test_onlists(self): "name": region_name, "sequence_type": sequence_type, "onlist": { + "file_id": list_id, "filename": list_name, - "location": list_location, + "filetype": list_type, + "filesize": list_size, + "url": list_name, + "urltype": list_urltype, "md5": list_md5sum, }, "sequence": sequence, diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index c613257..ae8ad6b 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -49,4 +49,4 @@ def test_validate_check_args(self): with patch("os.path.exists") as path_exists: path_exists.return_value = True errors = validate_check_args(None, args) - self.assertEqual(errors, None) + self.assertEqual(errors, []) diff --git a/tests/test_seqspec_onlist.py b/tests/test_seqspec_onlist.py index 6037088..40f6ad9 100644 --- a/tests/test_seqspec_onlist.py +++ b/tests/test_seqspec_onlist.py @@ -49,30 +49,30 @@ def test_run_onlist_region(self): self.assertEqual(len(regions), 1) region = regions[0] self.assertEqual(region.location, "local") - self.assertEqual(region.filename, "index_onlist.txt") + 
self.assertEqual(region.filename, "index_onlist.tsv") self.assertEqual(region.md5, "939cb244b4c43248fcc795bbe79599b0") def test_run_onlist_read(self): - with create_temporary_barcode_files(["index_onlist.txt"]): + with create_temporary_barcode_files(["index_onlist.tsv"]): spec = load_example_spec(example_spec) reads = run_onlist_read(spec, "rna", "read2.fastq.gz") self.assertEqual(len(reads), 1) read = reads[0] self.assertEqual(read.location, "local") - self.assertEqual(read.filename, "index_onlist.txt") + self.assertEqual(read.filename, "index_onlist.tsv") self.assertEqual(read.md5, "939cb244b4c43248fcc795bbe79599b0") def test_find_list_target_dir_local(self): with create_temporary_barcode_files(["index_onlist.txt"]) as tmpdir: - filename = os.path.join(tmpdir, "temp.txt") + filename = os.path.join(tmpdir, "temp.tsv") - onlist1 = Onlist(filename, "d41d8cd98f00b204e9800998ecf8427e", "local") + onlist1 = Onlist("temp_id", filename, "tsv", 300, filename, "local", "d41d8cd98f00b204e9800998ecf8427e", "local") target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, tmpdir) def test_find_list_target_dir_remote(self): - onlist1 = Onlist("http:localhost:9/temp.txt", "d41d8cd98f00b204e9800998ecf8427e", "remote") + onlist1 = Onlist("temp_id", "temp.tsv", "tsv", 300, "http://localhost:9/temp.tsv", "http", "d41d8cd98f00b204e9800998ecf8427e", "remote") target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, os.getcwd()) @@ -124,7 +124,7 @@ def test_join_onlist_multi(self): self.assertEqual(joined[2], "- GGTT") def test_local_validate_onlist_args(self): - onlist_name = "index_onlist.txt" + onlist_name = "index_onlist.tsv" with create_temporary_barcode_files([onlist_name]) as tmpdir: expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") @@ -133,15 +133,13 @@ def test_local_validate_onlist_args(self): subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) args = parser.parse_args([ - "onlist", "-m", "rna", "-r", "read1.fastq.gz", "-f", "multi", spec_path]) + "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) def load_spec(*args, **kwargs): return load_example_spec(example_spec) with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - onlist_path = validate_onlist_args(parser, args) - - self.assertEqual(onlist_path, expected_onlist_path) + validate_onlist_args(parser, args) def test_local_cached_remote_validate_onlist_args(self): # Test that we will can use a locally cached copy of one barcode file @@ -155,23 +153,26 @@ def test_local_cached_remote_validate_onlist_args(self): subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) args = parser.parse_args([ - "onlist", "-m", "rna", "-r", "read1.fastq.gz", "-f", "multi", spec_path]) + "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) def load_spec(*args, **kwargs): remote_spec = example_spec.replace( "location: local", "location: remote" ).replace( - "filename: index_onlist.txt", - "filename: http://localhost:9/foo/index_onlist.txt" + "url: index_onlist.tsv", + "url: http://localhost:9/foo/index_onlist.tsv" + ).replace( + "urltype: local", + "urltype: http", ) print(remote_spec) return load_example_spec(remote_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - onlist_path = validate_onlist_args(parser, args) + with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader, 
patch("seqspec.seqspec_onlist.read_remote_list", return_value="index_onlist.tsv") as fake_remote_list: + # Failed validation would raise an exception + validate_onlist_args(parser, args) - self.assertEqual(onlist_path, expected_onlist_path) def test_write_onlist_no_double_spacing(self): # Make sure that joined onlists don't end up double spaced. diff --git a/tests/test_seqspec_print.py b/tests/test_seqspec_print.py index 955281a..c8b229e 100644 --- a/tests/test_seqspec_print.py +++ b/tests/test_seqspec_print.py @@ -5,8 +5,8 @@ from matplotlib.figure import Figure from seqspec.seqspec_print import ( - run_print_library_tree, - run_print_library_png, + print_library_ascii, + print_seqspec_png, ) from seqspec.utils import load_spec_stream @@ -18,10 +18,10 @@ def setUp(self): self.example_spec = load_spec_stream(StringIO(example_spec_text)) def test_seqspec_print_tree(self): - tree = run_print_library_tree(self.example_spec) + tree = print_library_ascii(self.example_spec) self.assertIn("SOLiD_P1_adapter", tree) def test_seqspec_print_png(self): - fig = run_print_library_png(self.example_spec) + fig = print_seqspec_png(self.example_spec) self.assertIsInstance(fig, Figure) diff --git a/tests/test_utils.py b/tests/test_utils.py index e1cc281..34a36bc 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -28,8 +28,8 @@ region_rna_linker_dict, ) -example_spec = f"""!Assay -seqspec_version: { __version__ } +example_spec = """!Assay +seqspec_version: 0.3.0 assay_id: test_assay name: my assay doi: https://doi.org/10.1038/nmeth.1315 @@ -39,10 +39,10 @@ modalities: - rna lib_struct: https://teichlab.github.io/scg_lib_structs/methods_html/tang2009.html -library_protocol: custom 1 -library_kit: custom 2 -sequence_protocol: custom 3 -sequence_kit: custom 4 +library_protocol: "Custom" +library_kit: "Custom" +sequence_protocol: "Custom" +sequence_kit: "Custom" sequence_spec: - !Read read_id: read1.fastq.gz @@ -51,8 +51,16 @@ primer_id: SOLiD_P1_adapter min_len: 90 max_len: 187 - # this is a guess strand: pos + files: + - !File + file_id: read1 + filename: read1.fastq.gz + filetype: fastq + filesize: 123456789 + url: read1.fastq.gz + urltype: local + md5: 68b329da9893e34099c7d8ad5cb9c940 - !Read read_id: read2.fastq.gz name: read2 for experiment @@ -61,6 +69,15 @@ min_len: 25 max_len: 25 strand: neg + files: + - !File + file_id: read2 + filename: read2.fastq.gz + filetype: fastq + filesize: 123456789 + url: read2.fastq.gz + urltype: local + md5: 68b329da9893e34099c7d8ad5cb9c940 library_spec: - !Region region_id: rna @@ -114,7 +131,12 @@ min_len: 6 max_len: 6 onlist: !Onlist - filename: index_onlist.txt + file_id: onlist-1 + filename: index_onlist.tsv + filetype: tsv + filesize: 300 + url: index_onlist.tsv + urltype: local md5: 939cb244b4c43248fcc795bbe79599b0 location: local regions: null @@ -195,7 +217,7 @@ def test_read_local_list(self): with gzip.open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist(temp_list_filename, fake_md5, "local") + onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -210,7 +232,7 @@ def test_read_local_list_gz(self): with open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist(temp_list_filename, fake_md5, "local") + onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") loaded_list = read_local_list(onlist1) 
self.assertEqual(fake_onlist, loaded_list) @@ -233,12 +255,18 @@ def raise_for_status(self): return response() with patch("requests.get", new=fake_request_get): - onlist1 = Onlist("http://localhost/testlist.txt", fake_md5, "remote") + url = "http://localhost/testlist.txt" + onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5, "remote") loaded_list = read_remote_list(onlist1) self.assertEqual(fake_onlist, loaded_list) def test_get_igvf_auth(self): + # clean out the environment we inherited + for term in ["IGVF_SECRET_KEY", "IGVF_API_KEY"]: + if term in os.environ: + del os.environ[term] + test_data = [ (None, None, None), ("user", "pass", ("user", "pass")), From c62b0079ee4c813d9ed8fe141d4fb35d5040a366 Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 30 Apr 2025 14:58:02 -0500 Subject: [PATCH 12/21] fix version --- README.md | 3 +-- seqspec/__init__.py | 9 ++++++++- seqspec/seqspec_check.py | 4 +++- seqspec/seqspec_onlist.py | 16 ++++++++++++++++ seqspec/seqspec_upgrade.py | 7 ++++--- setup.cfg | 1 - setup.py | 3 ++- 7 files changed, 34 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 2c7c115..9ddbafb 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,6 @@ # seqspec -![github version](https://img.shields.io/badge/Version-0.3.1-informational) -[![pypi version](https://img.shields.io/pypi/v/seqspec)](https://pypi.org/project/seqspec/0.3.1/) +![github version](https://img.shields.io/badge/Version-0.3.0-informational) ![python versions](https://img.shields.io/pypi/pyversions/seqspec) [![license](https://img.shields.io/pypi/l/seqspec)](LICENSE) diff --git a/seqspec/__init__.py b/seqspec/__init__.py index 260c070..ad6cb61 100644 --- a/seqspec/__init__.py +++ b/seqspec/__init__.py @@ -1 +1,8 @@ -__version__ = "0.3.1" +__version__ = "0.3.0" + + +def get_version(): + """ + Returns the version of the package. 
+ """ + return __version__ diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 1e62276..3b5a8b0 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -4,6 +4,7 @@ from seqspec.utils import load_spec, file_exists from seqspec.Assay import Assay from argparse import RawTextHelpFormatter +from seqspec import get_version def setup_check_args(parser): @@ -80,6 +81,7 @@ def run_check(spec_fn, o, s): {"error_type": "check_onlist_files_exist", "error_object": "onlist"} ] + def filter_errors(errors, filter_type): filters = None if filter_type == "igvf": @@ -200,7 +202,7 @@ def check_onlist_files_exist(spec, spec_fn, errors, idx): idx += 1 elif ol.urltype == "http" or ol.urltype == "https" or ol.urltype == "ftp": # ping the link with a simple http request to check if the file exists at that URI - if spec.seqspec_version == "0.3.0": + if spec.seqspec_version == get_version(): if not file_exists(ol.url): errobj = { "error_type": "check_onlist_files_exist", diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index b021a5a..dac8230 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -124,6 +124,9 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): } onlists = CMD[idtype](spec, modality, ids) + print("idtype:", idtype) + print("modality:", modality) + print("ids:", ids) if len(onlists) == 0: raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") @@ -162,6 +165,7 @@ def run_onlist_region_type( spec: Assay, modality: str, region_type: str ) -> List[Onlist]: regions = find_by_region_type(spec, modality, region_type) + print("regions:", regions) onlists: List[Onlist] = [] for r in regions: ol = r.get_onlist() @@ -182,15 +186,27 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist def run_onlist_read(spec: Assay, modality: str, read_id: str) -> List[Onlist]: (read, rgns) = map_read_id_to_regions(spec, modality, read_id) + print("read:", read) + print() + print("rgns:", rgns) + print() # convert regions to region coordinates rcs = project_regions_to_coordinates(rgns) + print("rcs:", rcs) + print() # intersect read with region coordinates new_rcs = itx_read(rcs, 0, read.max_len) + print("new_rcs:", new_rcs) + print("reads mex len:", read.max_len) + print() onlists: List[Onlist] = [] for r in new_rcs: ol = r.get_onlist() if ol: + print("region:", r) + print("onlist:", ol) + print() onlists.append(ol) return onlists diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 8a982a4..032adbb 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -2,6 +2,7 @@ from seqspec.File import File from seqspec.Region import Onlist from argparse import RawTextHelpFormatter +from seqspec import get_version def setup_upgrade_args(parser): @@ -53,14 +54,14 @@ def upgrade(spec, version): "0.1.0": upgrade_0_1_0_to_0_3_0, "0.1.1": upgrade_0_1_1_to_0_3_0, "0.2.0": upgrade_0_2_0_to_0_3_0, - "0.3.0": upgrade_0_3_0_to_0_3_0, + get_version(): no_upgrade, } u = UPGRADE[version](spec) return u -def upgrade_0_3_0_to_0_3_0(spec): +def no_upgrade(spec): return spec @@ -100,7 +101,7 @@ def upgrade_0_2_0_to_0_3_0(spec): md5=md5, location=location, ) - spec.seqspec_version = "0.3.0" + spec.seqspec_version = get_version() return spec diff --git a/setup.cfg b/setup.cfg index 435b9be..594931a 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,4 @@ [bumpversion] -current_version = 0.3.1 commit = True tag = True diff --git a/setup.py b/setup.py index 5afafa8..1cff4a8 100644 --- a/setup.py 
+++ b/setup.py @@ -1,4 +1,5 @@ from setuptools import find_packages, setup +import seqspec def read(path): @@ -10,7 +11,7 @@ def read(path): setup( name="seqspec", - version="0.3.1", + version=seqspec.get_version(), url="https://github.com/sbooeshaghi/seqspec", author="Sina Booeshaghi", author_email="abooesha@caltech.edu", From 255c2ca64527765215aaea8a432b337d4d357fde Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 30 Apr 2025 15:01:40 -0500 Subject: [PATCH 13/21] remove print --- seqspec/seqspec_onlist.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index dac8230..b021a5a 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -124,9 +124,6 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): } onlists = CMD[idtype](spec, modality, ids) - print("idtype:", idtype) - print("modality:", modality) - print("ids:", ids) if len(onlists) == 0: raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") @@ -165,7 +162,6 @@ def run_onlist_region_type( spec: Assay, modality: str, region_type: str ) -> List[Onlist]: regions = find_by_region_type(spec, modality, region_type) - print("regions:", regions) onlists: List[Onlist] = [] for r in regions: ol = r.get_onlist() @@ -186,27 +182,15 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist def run_onlist_read(spec: Assay, modality: str, read_id: str) -> List[Onlist]: (read, rgns) = map_read_id_to_regions(spec, modality, read_id) - print("read:", read) - print() - print("rgns:", rgns) - print() # convert regions to region coordinates rcs = project_regions_to_coordinates(rgns) - print("rcs:", rcs) - print() # intersect read with region coordinates new_rcs = itx_read(rcs, 0, read.max_len) - print("new_rcs:", new_rcs) - print("reads mex len:", read.max_len) - print() onlists: List[Onlist] = [] for r in new_rcs: ol = r.get_onlist() if ol: - print("region:", r) - print("onlist:", ol) - print() onlists.append(ol) return onlists From 17b1923cd507ecdf42e5b36d6c544fecc9e2b171 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Mon, 30 Jun 2025 14:07:33 -0500 Subject: [PATCH 14/21] CHECK-207-kb-single (#9) --- requirements.txt | 3 +- seqspec/Region.py | 2 +- seqspec/main.py | 133 ++++++++----- seqspec/seqspec_check.py | 50 +++-- seqspec/seqspec_convert.py | 137 ++++++++----- seqspec/seqspec_diff.py | 169 +++++++++++++--- seqspec/seqspec_file.py | 292 +++++++++++++++++----------- seqspec/seqspec_find.py | 151 ++++++++++----- seqspec/seqspec_format.py | 75 ++++++-- seqspec/seqspec_index.py | 180 +++++++++++------ seqspec/seqspec_info.py | 79 +++++--- seqspec/seqspec_init.py | 190 ++++++++++-------- seqspec/seqspec_methods.py | 91 +++++---- seqspec/seqspec_modify.py | 233 +++++++++++----------- seqspec/seqspec_onlist.py | 101 ++++++---- seqspec/seqspec_print.py | 342 +++++++++++++++------------------ seqspec/seqspec_print_html.py | 95 ++++----- seqspec/seqspec_print_utils.py | 79 ++++++++ seqspec/seqspec_split.py | 73 ++++--- seqspec/seqspec_upgrade.py | 84 ++++---- seqspec/seqspec_version.py | 59 +++--- setup.cfg | 2 +- 22 files changed, 1603 insertions(+), 1017 deletions(-) create mode 100644 seqspec/seqspec_print_utils.py diff --git a/requirements.txt b/requirements.txt index 4d23455..acff050 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,4 +3,5 @@ jsonschema newick requests biopython -packaging \ No newline at end of file +packaging +matplotlib>=3.4.0 \ No newline at 
end of file diff --git a/seqspec/Region.py b/seqspec/Region.py index 9a2a7fd..f7edbc0 100644 --- a/seqspec/Region.py +++ b/seqspec/Region.py @@ -417,7 +417,7 @@ def __init__( url: str, urltype: str, md5: str, - location: Optional[str], + # location: Optional[str], ) -> None: super().__init__() self.file_id = file_id diff --git a/seqspec/main.py b/seqspec/main.py index a5bb35e..b800ab7 100644 --- a/seqspec/main.py +++ b/seqspec/main.py @@ -1,46 +1,48 @@ -from . import __version__ -import argparse -import sys -from .seqspec_format import setup_format_args, validate_format_args -from .seqspec_print import setup_print_args, validate_print_args -from .seqspec_check import setup_check_args, validate_check_args -from .seqspec_find import setup_find_args, validate_find_args - -from .seqspec_convert import setup_convert_args, validate_convert_args -from .seqspec_modify import setup_modify_args, validate_modify_args -from .seqspec_index import setup_index_args, validate_index_args -from .seqspec_info import setup_info_args, validate_info_args - -from .seqspec_split import setup_split_args, validate_split_args -from .seqspec_init import setup_init_args, validate_init_args -from .seqspec_onlist import setup_onlist_args, validate_onlist_args -from .seqspec_version import setup_version_args, validate_version_args -from .seqspec_methods import setup_methods_args, validate_methods_args -from .seqspec_file import setup_file_args, validate_file_args -from .seqspec_upgrade import setup_upgrade_args, validate_upgrade_args +"""Main module for seqspec CLI. -import warnings - -# Steps to add new subcommands -# Create seqspec_subcommand.py (create setup_subcmd_args, validate_subcmd_args, run_subcmd in that file) -# (in this file) from seqspec_subcmd import setup_subcmd_args, validate_subcmd_args -# Add setup_subcmd_args to command_to_parser along with its key==str(subcmd) -# Add validate_subcmd_args to COMMAND_TO_FUNCTION along with its key==str(subcmd) +This module provides the main entry point for the seqspec command-line interface. +It handles argument parsing, command routing, and execution of subcommands. +""" +import sys +import warnings +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, Callable, Any -def main(): - warnings.simplefilter("default", DeprecationWarning) +from . import __version__ - # setup parsers - parser = argparse.ArgumentParser( +# Import subcommand modules +from .seqspec_format import setup_format_args, run_format +from .seqspec_print import setup_print_args, run_print +from .seqspec_check import setup_check_args, run_check +from .seqspec_find import setup_find_args, run_find +from .seqspec_convert import setup_convert_args, run_convert +from .seqspec_modify import setup_modify_args, run_modify +from .seqspec_index import setup_index_args, run_index +from .seqspec_info import setup_info_args, run_info +from .seqspec_split import setup_split_args, run_split +from .seqspec_init import setup_init_args, run_init +from .seqspec_onlist import setup_onlist_args, run_onlist +from .seqspec_version import setup_version_args, run_version +from .seqspec_methods import setup_methods_args, run_methods +from .seqspec_file import setup_file_args, run_file +from .seqspec_upgrade import setup_upgrade_args, run_upgrade + + +def setup_parser(): + """Create and configure the main argument parser. + + Returns: + Configured ArgumentParser instance. 
+ """ + parser = ArgumentParser( description=f""" seqspec {__version__}: A machine-readable file format for genomic library sequence and structure. GitHub: https://github.com/pachterlab/seqspec Documentation: https://pachterlab.github.io/seqspec/ - """, - formatter_class=argparse.RawTextHelpFormatter, + formatter_class=RawTextHelpFormatter, ) subparsers = parser.add_subparsers( @@ -67,7 +69,18 @@ def main(): "version": setup_version_args(subparsers), } - # Show help when no arguments are given + return parser, command_to_parser + + +def handle_no_args( + parser: ArgumentParser, command_to_parser: Dict[str, ArgumentParser] +) -> None: + """Handle case when no arguments are provided. + + Args: + parser: Main argument parser. + command_to_parser: Dictionary mapping commands to their parsers. + """ if len(sys.argv) == 1: parser.print_help(sys.stderr) sys.exit(1) @@ -80,27 +93,43 @@ def main(): parser.print_help(sys.stderr) sys.exit(1) + +def main() -> None: + """Main entry point for the seqspec CLI.""" + warnings.simplefilter("default", DeprecationWarning) + + parser, command_to_parser = setup_parser() + handle_no_args(parser, command_to_parser) + args = parser.parse_args() - # Setup validator and runner for all subcommands (validate and run if valid) - COMMAND_TO_FUNCTION = { - "format": validate_format_args, - "print": validate_print_args, - "check": validate_check_args, - "find": validate_find_args, - "index": validate_index_args, - "info": validate_info_args, - "init": validate_init_args, - "methods": validate_methods_args, - "modify": validate_modify_args, - "onlist": validate_onlist_args, - "split": validate_split_args, - "version": validate_version_args, - "file": validate_file_args, - "upgrade": validate_upgrade_args, - "convert": validate_convert_args, + # Setup validator and runner for all subcommands + command_to_function: Dict[str, Callable[[ArgumentParser, Namespace], Any]] = { + "format": run_format, + "print": run_print, + "check": run_check, + "find": run_find, + "index": run_index, + "info": run_info, + "init": run_init, + "methods": run_methods, + "modify": run_modify, + "onlist": run_onlist, + "split": run_split, + "version": run_version, + "file": run_file, + "upgrade": run_upgrade, + "convert": run_convert, } - COMMAND_TO_FUNCTION[sys.argv[1]](parser, args) + + try: + command_to_function[sys.argv[1]](parser, args) + except KeyError: + parser.print_help(sys.stderr) + sys.exit(1) + except Exception as e: + print(f"Error: {str(e)}", file=sys.stderr) + sys.exit(1) if __name__ == "__main__": diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 3b5a8b0..82a2894 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -1,13 +1,23 @@ +"""Check module for seqspec CLI. + +This module provides functionality to validate seqspec files against the specification schema. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from jsonschema import Draft4Validator import yaml from os import path + from seqspec.utils import load_spec, file_exists from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter + from seqspec import get_version def setup_check_args(parser): + """Create and configure the check command subparser.""" subparser = parser.add_parser( "check", description=""" @@ -22,45 +32,51 @@ def setup_check_args(parser): ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "-s", + "--skip", metavar="SKIP", - help=("Skip checks"), + help="Skip checks", type=str, default=None, choices=["igvf", "igvf_onlist_skip"], ) - subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) return subparser -def validate_check_args(parser, args): - spec_fn = args.yaml - o = args.o - s = args.s +def validate_check_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the check command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - return run_check(spec_fn, o, s) +def run_check(parser: ArgumentParser, args: Namespace): + """Run the check command.""" + validate_check_args(parser, args) -def run_check(spec_fn, o, s): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) + errors = check(spec, args.yaml) - errors = check(spec, spec_fn) - if s == "igvf": + if args.skip == "igvf": errors = filter_errors(errors, "igvf") - elif s == "igvf_onlist_skip": + elif args.skip == "igvf_onlist_skip": errors = filter_errors(errors, "igvf_onlist_skip") if errors: - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: for idx, e in enumerate(errors, 1): print(format_error(e, idx), file=f) else: diff --git a/seqspec/seqspec_convert.py b/seqspec/seqspec_convert.py index ccffa7c..11a4d1a 100644 --- a/seqspec/seqspec_convert.py +++ b/seqspec/seqspec_convert.py @@ -1,16 +1,22 @@ -from seqspec.utils import load_genbank +"""Convert module for seqspec CLI. + +This module provides functionality to convert between different formats (seqspec, genbank, token). 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace import json -from seqspec.Region import Region -from seqspec.Assay import Assay -from seqspec.utils import load_spec import numpy as np from typing import Dict, List, Tuple -from os import path -from pathlib import Path +import os +from seqspec.utils import load_genbank, load_spec +from seqspec.Region import Region +from seqspec.Assay import Assay -schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") +# Load schema and constants +schema_fn = os.path.join(os.path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as f: schema = json.load(f) REGION_TYPES = schema["$defs"]["region"]["properties"]["region_type"]["enum"] @@ -18,51 +24,96 @@ SEQUENCE_TYPES = schema["$defs"]["region"]["properties"]["sequence_type"]["enum"] -def setup_convert_args(parser): +def setup_convert_args(parser) -> ArgumentParser: + """Create and configure the convert command subparser.""" subparser = parser.add_parser( "convert", - description="get genbank about seqspec file", - help="get genbank about seqspec file", + description=""" +Convert between different formats (seqspec, genbank, token). + +Examples: +seqspec convert -ifmt seqspec -ofmt token spec.yaml -o output_dir # Convert seqspec to token format +seqspec convert -ifmt genbank -ofmt seqspec input.gb -o spec.yaml # Convert genbank to seqspec +--- +""", + help="Convert between different formats", + formatter_class=RawTextHelpFormatter, ) choices = ["genbank", "seqspec", "token"] subparser.add_argument( - "-ifmt", help="Input format", type=str, default="seqspec", choices=choices + "-ifmt", + "--input-format", + help="Input format", + type=str, + default="seqspec", + choices=choices, ) subparser.add_argument( - "-ofmt", help="Output format", type=str, default="token", choices=choices + "-ofmt", + "--output-format", + help="Output format", + type=str, + default="token", + choices=choices, ) - subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file or directory", + type=Path, default=None, required=False, ) subparser.add_argument( - "input_file", metavar="IN", help="Path to input file", type=str + "input_file", + metavar="IN", + help="Path to input file", + type=Path, ) return subparser -def validate_convert_args(parser, args): - # if everything is valid the run_convert - fn = args.input_file - ifmt = args.ifmt - ofmt = args.ofmt - o = args.o +def validate_convert_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the convert command arguments.""" + if not Path(args.input_file).exists(): + parser.error(f"Input file does not exist: {args.input_file}") + + if args.output and Path(args.output).exists(): + if args.output_format == "token": + if not Path(args.output).is_dir(): + parser.error( + f"Output path exists but is not a directory: {args.output}" + ) + else: + if not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_convert(parser: ArgumentParser, args: Namespace) -> None: + """Run the convert command.""" + validate_convert_args(parser, args) + + CONVERT = { + ("genbank", "seqspec"): gb_to_seqspec, + ("seqspec", "token"): seqspec_to_token, + } - cnv = run_convert(fn, ifmt, ofmt, o) - print(cnv) + file = load_input_file(args.input_file, args.input_format) + result = CONVERT[(args.input_format, args.output_format)](file) - # if o: - # spec.to_YAML(o) - # else: - # 
print(json.dumps(spec, sort_keys=False, indent=4)) + if args.output: + if args.output_format == "token": + save_tokenized_spec(*result, str(args.output)) # noqa + else: + # Handle other output formats here + pass + else: + return result -def load_input_file(fn, ifmt): +def load_input_file(fn: Path, ifmt: str): + """Load input file based on format.""" LOAD = { "genbank": load_genbank, "seqspec": load_spec, @@ -123,20 +174,6 @@ def save_tokenized_spec( f.write(f"{feature}\n") -def run_convert(fn, ifmt, ofmt, o): - CONVERT = { - ("genbank", "seqspec"): gb_to_seqspec, - ("seqspec", "token"): seqspec_to_token, - } - file = load_input_file(fn, ifmt) - c = CONVERT[(ifmt, ofmt)](file) - if o: - save_tokenized_spec(*c, o) - else: - return c - return - - def seqspec_to_token(spec): # for each modalitiy, make a dictionary of regions specs_regions = {} @@ -150,7 +187,7 @@ def seqspec_to_token(spec): def tokenize_specs( - specs_regions: Dict[str, Dict[str, List[Dict]]] + specs_regions: Dict[str, Dict[str, List[Dict]]], ) -> Tuple[np.ndarray, List[Tuple[str, str, str]]]: """ Convert specs into a single matrix where each row represents a complete region specification @@ -167,14 +204,6 @@ def tokenize_specs( n_region_type_features = len(REGION_TYPES) n_sequence_type_features = len(SEQUENCE_TYPES) - # Total features = one-hot encodings + numerical features - total_features = ( - n_modality_features # modality one-hot - + n_region_type_features # region_type one-hot - + n_sequence_type_features # sequence_type one-hot - + 2 # min_len, max_len - ) - # Total features = one-hot encodings + numerical features + position total_features = ( n_modality_features # modality one-hot @@ -184,6 +213,8 @@ def tokenize_specs( + 1 # position in region list (1-based) ) + # features = get_feature_names() + rows = [] # Will hold our feature vectors row_identifiers = [] # Will hold (spec_id, modality, region_type) tuples @@ -192,7 +223,7 @@ def tokenize_specs( # Enumerate regions to get position (1-based) for position, region in enumerate(regions, start=1): # Create feature vector for this region - feature_vector = np.zeros(total_features) + feature_vector = np.zeros(total_features).astype(int) current_idx = 0 # Add modality one-hot diff --git a/seqspec/seqspec_diff.py b/seqspec/seqspec_diff.py index c4e6a57..b4ba3a8 100644 --- a/seqspec/seqspec_diff.py +++ b/seqspec/seqspec_diff.py @@ -1,45 +1,156 @@ -from .Region import Region -from seqspec.Assay import Assay +"""Diff module for seqspec. + +This module provides functionality to compare two seqspec files and identify differences. +""" + +from pathlib import Path +from argparse import ArgumentParser, Namespace +from typing import List + from seqspec.utils import load_spec +from seqspec.Assay import Assay +from seqspec.Region import Region -def setup_diff_args(parser): - parser_diff = parser.add_parser( +def setup_diff_args(parser) -> ArgumentParser: + """Create and configure the diff command subparser.""" + subparser = parser.add_parser( "diff", - description="diff two seqspecs", - help="diff two seqspecs", + description=""" +Compare two seqspec files and identify differences. 
+ +Examples: +seqspec diff spec1.yaml spec2.yaml # Compare two specs and print differences +seqspec diff spec1.yaml spec2.yaml -o diff.txt # Compare specs and save differences to file +--- +""", + help="Compare two seqspec files and identify differences", + ) + subparser.add_argument( + "yamlA", help="First sequencing specification yaml file", type=str ) - parser_diff.add_argument("yamlA", help="Sequencing specification yaml file A") - parser_diff.add_argument("yamlB", help="Sequencing specification yaml file B") - parser_diff.add_argument( + subparser.add_argument( + "yamlB", help="Second sequencing specification yaml file", type=str + ) + subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) - return parser_diff + return subparser + + +def validate_diff_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the diff command arguments.""" + if not Path(args.yamlA).exists(): + parser.error(f"Input file A does not exist: {args.yamlA}") + if not Path(args.yamlB).exists(): + parser.error(f"Input file B does not exist: {args.yamlB}") + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_diff(parser: ArgumentParser, args: Namespace) -> None: + """Run the diff command.""" + validate_diff_args(parser, args) + + spec_a = load_spec(args.yamlA) + spec_b = load_spec(args.yamlB) + + differences = compare_specs(spec_a, spec_b) + + if args.output: + args.output.write_text(differences) + else: + print(differences) + + +def compare_specs(spec_a: Assay, spec_b: Assay) -> str: + """Compare two specs and return a string describing their differences.""" + differences = [] + + # Compare modalities + modalities_a = set(spec_a.modalities) + modalities_b = set(spec_b.modalities) + + if modalities_a != modalities_b: + differences.append("Modalities differ:") + differences.append(f" Spec A: {', '.join(sorted(modalities_a))}") + differences.append(f" Spec B: {', '.join(sorted(modalities_b))}") + + # Compare common modalities + common_modalities = modalities_a.intersection(modalities_b) + for modality in common_modalities: + regions_a = spec_a.get_libspec(modality).get_leaves() + regions_b = spec_b.get_libspec(modality).get_leaves() + + region_diffs = compare_regions(regions_a, regions_b) + if region_diffs: + differences.append(f"\nModality '{modality}' differences:") + differences.extend(region_diffs) + + return "\n".join(differences) if differences else "No differences found" + + +def compare_regions(regions_a: List[Region], regions_b: List[Region]) -> List[str]: + """Compare two lists of regions and return a list of differences.""" + differences = [] + + # Create lookup dictionaries + regions_a_dict = {r.region_id: r for r in regions_a} + regions_b_dict = {r.region_id: r for r in regions_b} + + # Find regions unique to each spec + unique_to_a = set(regions_a_dict.keys()) - set(regions_b_dict.keys()) + unique_to_b = set(regions_b_dict.keys()) - set(regions_a_dict.keys()) + + if unique_to_a: + differences.append(" Regions only in spec A:") + for region_id in sorted(unique_to_a): + differences.append(f" - {region_id}") + + if unique_to_b: + differences.append(" Regions only in spec B:") + for region_id in sorted(unique_to_b): + differences.append(f" - {region_id}") + # Compare common regions + common_regions = set(regions_a_dict.keys()).intersection(set(regions_b_dict.keys())) + for region_id in 
sorted(common_regions): + region_a = regions_a_dict[region_id] + region_b = regions_b_dict[region_id] -def validate_diff_args(parser, args): - # if everything is valid the run_diff - A_fn = args.yamlA - B_fn = args.yamlB - # o = args.o - A = load_spec(A_fn) - B = load_spec(B_fn) + region_diffs = diff_regions(region_a, region_b) + if region_diffs: + differences.append(f" Region '{region_id}' differences:") + differences.extend(f" - {diff}" for diff in region_diffs) - # load in two specs - run_diff(A, B) + return differences -def run_diff(A: Assay, B: Assay): - # What does it mean to diff two assays? - # Only compare on modalities? - # itx: pull out regions that have the same name? - # itx: - pass +def diff_regions(region_a: Region, region_b: Region) -> List[str]: + """Compare two regions and return a list of differences.""" + differences = [] + # Compare basic properties + if region_a.region_type != region_b.region_type: + differences.append( + f"region_type: {region_a.region_type} != {region_b.region_type}" + ) + if region_a.name != region_b.name: + differences.append(f"name: {region_a.name} != {region_b.name}") + if region_a.sequence_type != region_b.sequence_type: + differences.append( + f"sequence_type: {region_a.sequence_type} != {region_b.sequence_type}" + ) + if region_a.sequence != region_b.sequence: + differences.append(f"sequence: {region_a.sequence} != {region_b.sequence}") + if region_a.min_len != region_b.min_len: + differences.append(f"min_len: {region_a.min_len} != {region_b.min_len}") + if region_a.max_len != region_b.max_len: + differences.append(f"max_len: {region_a.max_len} != {region_b.max_len}") -def diff_regions(R1: Region, R2: Region): - pass + return differences diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 31be20d..48563e6 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -1,16 +1,22 @@ +"""File module for seqspec. + +This module provides functionality to list and format files present in seqspec files. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import Dict, List, Optional +from collections import defaultdict +import json + from seqspec.utils import load_spec from seqspec.Assay import Assay -from collections import defaultdict from seqspec.File import File -from typing import Dict, List, Optional -from argparse import RawTextHelpFormatter -import json from seqspec import seqspec_find -import os -import argparse -def setup_file_args(parser): +def setup_file_args(parser) -> ArgumentParser: + """Create and configure the file command subparser.""" subparser = parser.add_parser( "file", description=""" @@ -29,36 +35,37 @@ def setup_file_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "-i", + "--ids", metavar="IDs", - help=("Ids to list"), + help="Ids to list", type=str, default=None, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) - # the object we are using to index - # choices = ["read", "region", "file", "onlist", "region-type"] choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -66,6 +73,7 @@ def setup_file_args(parser): choices = ["paired", "interleaved", "index", "list", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", help=f"Format, [{', '.join(choices)}], default: paired", type=str, @@ -84,6 +92,7 @@ def setup_file_args(parser): ] subparser.add_argument( "-k", + "--key", metavar="KEY", help=f"Key, [{', '.join(choices)}], default: file_id", type=str, @@ -94,7 +103,7 @@ def setup_file_args(parser): # option to get the full path of the file subparser.add_argument( "--fullpath", - help=argparse.SUPPRESS, + help="Use full path for local files", action="store_true", default=False, ) @@ -102,59 +111,82 @@ def setup_file_args(parser): return subparser -def validate_file_args(parser, args): - spec_fn = os.path.abspath(args.yaml) - o = args.o - m = args.m # modality - idtype = args.s # selector - fmt = args.f # format - ids = args.i # ids - k = args.k # key - fp = args.fullpath - - if (k == "filesize" or k == "filetype" or k == "urltype" or k == "md5") and ( - fmt == "paired" or fmt == "interleaved" or fmt == "index" - ): - parser.error(f"-f {fmt} valid only with -k file_id, filename, url") - - return run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=fp) - - -def run_file(spec_fn, m, ids, idtype, fmt, k, o, fp=False): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - files = file(spec, m, ids, idtype, fmt, k, spec_fn, fp) +def validate_file_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the file command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + 
parser.error(f"Output path exists but is not a file: {args.output}") + + if args.key in ["filesize", "filetype", "urltype", "md5"] and args.format in [ + "paired", + "interleaved", + "index", + ]: + parser.error( + f"Format '{args.format}' valid only with key 'file_id', 'filename', or 'url'" + ) + + +def run_file(parser: ArgumentParser, args: Namespace) -> None: + """Run the file command.""" + validate_file_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + files = list_files( + spec, + args.modality, + ids, + args.selector, + args.format, + args.key, + args.yaml, + args.fullpath, + ) if files: - if o: - with open(o, "w") as f: - print(files, file=f) + if args.output: + args.output.write_text(str(files)) else: print(files) - return -def file( +def list_files( spec: Assay, modality: str, ids: List[str], idtype: str, fmt: str, - k: Optional[str], - spec_fn: str, + k: str, + spec_fn: Path, fp: bool = False, -): +) -> str: + """List files based on the given parameters. + + Args: + spec: The seqspec specification. + modality: The modality to list files for. + ids: List of IDs to filter by. + idtype: Type of ID to filter by (read, region, file, region-type). + fmt: Output format (paired, interleaved, index, list, json). + k: Key to use for output (file_id, filename, etc.). + spec_fn: Path to the spec file. + fp: Whether to use full paths for local files. + + Returns: + Formatted string containing the file information. + """ # NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST # NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read. LIST_FILES = { "read": list_read_files, "region": list_region_files, - "file": list_files, - # "onlist": list_onlist_files, + "file": list_all_files, } + LIST_FILES_BY_ID = { "read": list_files_by_read_id, "file": list_files_by_file_id, @@ -162,13 +194,6 @@ def file( "region-type": list_files_by_region_type, } - if len(ids) == 0: - # list all the files - files = LIST_FILES[idtype](spec, modality) - else: - # list only the id files - files = LIST_FILES_BY_ID[idtype](spec, modality, ids) - FORMAT = { "list": format_list_files_metadata, "paired": format_list_files, @@ -177,19 +202,30 @@ def file( "json": format_json_files, } - x = FORMAT[fmt](files, fmt, k, spec_fn, fp) - return x + # Get files based on whether we're filtering by IDs + if not ids: + # list all files + files = LIST_FILES[idtype](spec, modality) + else: + # list files by id + files = LIST_FILES_BY_ID[idtype](spec, modality, ids) + # Format the output + return FORMAT[fmt](files, fmt, k, spec_fn, fp) -def list_read_files(spec, modality): + +def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List files for all reads in a modality.""" files = defaultdict(list) reads = spec.get_seqspec(modality) for rd in reads: - files[rd.read_id] = rd.files + if rd.files: + files[rd.read_id] = rd.files return files -def list_files(spec, modality): +def list_all_files(spec: Assay, modality: str) -> Dict[str, List[File]]: + """List all files in a modality.""" files_rd = list_read_files(spec, modality) files_rgn = list_region_files(spec, modality) return {**files_rd, **files_rgn} @@ -210,131 +246,155 @@ def list_region_files(spec, modality): def format_list_files_metadata( - files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False -): - x = "" + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format file 
metadata as a tab-separated list.""" + x = [] if k == "all": for items in zip(*files.values()): for key, item in zip(files.keys(), items): - x += f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}\n" - x = x[:-1] - + x.append( + f"{key}\t{item.file_id}\t{item.filename}\t{item.filetype}\t{item.filesize}\t{item.url}\t{item.urltype}\t{item.md5}" + ) else: for items in zip(*files.values()): for key, item in zip(files.keys(), items): attr = str(getattr(item, k)) id = item.file_id - x += f"{key}\t{id}\t{attr}\n" - x = x[:-1] - - return x + x.append(f"{key}\t{id}\t{attr}") + return "\n".join(x) -def format_json_files(files: Dict[str, List[File]], fmt, k, spec_fn="", fp=False): +def format_json_files( + files: Dict[str, List[File]], + fmt: str, + k: str, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as JSON.""" x = [] for items in zip(*files.values()): if k == "all": for key, item in zip(files.keys(), items): d = item.to_dict() if item.urltype == "local" and fp: - d["url"] = os.path.join(os.path.dirname(spec_fn), d["url"]) + d["url"] = str(spec_fn.parent / d["url"]) x.append(d) else: for key, item in zip(files.keys(), items): attr = getattr(item, k) if k == "url" and item.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) + attr = str(spec_fn.parent / attr) x.append({"file_id": item.file_id, k: attr}) return json.dumps(x, indent=4) -def format_list_files(files: Dict[str, List[File]], fmt, k=None, spec_fn="", fp=False): - x = "" +def format_list_files( + files: Dict[str, List[File]], + fmt: str, + k: Optional[str] = None, + spec_fn: Path = Path(""), + fp: bool = False, +) -> str: + """Format files as a list based on the format type.""" + x = [] + if fmt == "paired": - x = "" for items in zip(*files.values()): - t = "" + t = [] for i in items: if k: attr = str(getattr(i, k)) if k == "url" and i.urltype == "local" and fp: - attr = os.path.join(os.path.dirname(spec_fn), attr) - t += f"{attr}\t" + attr = str(spec_fn.parent / attr) + t.append(attr) else: - t += f"{i.filename}\t" - x += f"{t[:-1]}\n" - x = x[:-1] + t.append(i.filename) + x.append("\t".join(t)) - elif fmt == "interleaved" or fmt == "list": + elif fmt in ["interleaved", "list"]: for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "\n" - x = x[:-1] + id = str(spec_fn.parent / id) + x.append(id) + elif fmt == "index": + t = [] for items in zip(*files.values()): for item in items: id = item.filename if k: id = str(getattr(item, k)) if k == "url" and item.urltype == "local" and fp: - id = os.path.join(os.path.dirname(spec_fn), id) - x += id + "," - x = x[:-1] + id = str(spec_fn.parent / id) + t.append(id) + x.append(",".join(t)) - return x + return "\n".join(x) -def list_files_by_read_id(spec, modality, read_ids): +def list_files_by_read_id( + spec: Assay, modality: str, read_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific read IDs.""" seqspec = spec.get_seqspec(modality) files = defaultdict(list) ids = set(read_ids) # TODO return the files in the order of the ids given in the input # NOTE ORDERING HERE IS IMPORANT SEE GET_INDEX_BY_FILES FUNCTION for read in seqspec: - if read.read_id in ids: - for file in read.files: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.read_id in ids and 
read.files: + files[read.read_id].extend(read.files) return files -def list_files_by_file_id(spec, modality, file_ids): +def list_files_by_file_id( + spec: Assay, modality: str, file_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific file IDs.""" seqspec = spec.get_seqspec(modality) ids = set(file_ids) files = defaultdict(list) # TODO: NOTE ORDERING HERE IS IMPORTANT SEE RUN_LIST_FILES FUNCTION for read in seqspec: - for file in read.files: - if file.filename in ids: - # files[read.read_id].append(file.filename) - files[read.read_id].append(file) + if read.files: + for file in read.files: + if file.filename in ids: + files[read.read_id].append(file) return files -def list_files_by_region_id(spec, modality, file_ids): +def list_files_by_region_id( + spec: Assay, modality: str, region_ids: List[str] +) -> Dict[str, List[File]]: + """List files for specific region IDs.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_ids) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): if region_id in ids: - new_files[region_id] += files + new_files[region_id].extend(region_files) return new_files -def list_files_by_region_type(spec, modality, file_ids): +def list_files_by_region_type( + spec: Assay, modality: str, region_types: List[str] +) -> Dict[str, List[File]]: + """List files for specific region types.""" files = list_region_files(spec, modality) - - ids = set(file_ids) + ids = set(region_types) new_files = defaultdict(list) - for region_id, files in files.items(): + for region_id, region_files in files.items(): r = seqspec_find.find_by_region_id(spec, modality, region_id)[0] - rt = r.region_type - if rt in ids: - new_files[region_id] += files + if r.region_type in ids: + new_files[region_id].extend(region_files) return new_files diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py index f5aec05..c1826ce 100644 --- a/seqspec/seqspec_find.py +++ b/seqspec/seqspec_find.py @@ -1,13 +1,24 @@ +"""Find module for seqspec CLI. + +This module provides functionality to search for objects within seqspec files. 
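+
+Example usage (illustrative; the modality and region type here are hypothetical):
+
+    seqspec find -m rna -s region-type -i barcode spec.yaml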
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +import yaml +from typing import List + from seqspec.utils import load_spec from seqspec.Assay import Assay -import yaml -import argparse -import warnings -from argparse import RawTextHelpFormatter -from seqspec.seqspec_file import list_files +from seqspec.Read import Read +from seqspec.Region import Region +from seqspec.File import File +from seqspec.seqspec_file import list_all_files -def setup_find_args(parser): +def setup_find_args(parser) -> ArgumentParser: + """Create and configure the find command subparser.""" subparser = parser.add_parser( "find", description=""" @@ -24,45 +35,48 @@ def setup_find_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) # depracate - subparser.add_argument("--rtype", help=argparse.SUPPRESS, action="store_true") + subparser.add_argument("--rtype", help=SUPPRESS, action="store_true") choices = ["read", "region", "file", "region-type"] subparser.add_argument( "-s", - metavar="Selector", - help=(f"Selector, [{','.join(choices)}] (default: region)"), + "--selector", + metavar="SELECTOR", + help=f"Selector, [{','.join(choices)}] (default: region)", type=str, default="region", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) # depracate -r subparser_required.add_argument( "-r", metavar="REGION", - help=argparse.SUPPRESS, + help=SUPPRESS, type=str, default=None, ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, @@ -71,8 +85,14 @@ def setup_find_args(parser): return subparser -def validate_find_args(parser, args): - # IDs +def validate_find_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the find command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. 
" @@ -80,45 +100,47 @@ def validate_find_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - o = args.o - idtype = args.s # selector - ids = args.i + if not args.id: + args.id = args.r - # run function - return run_find(fn, m, ids, idtype, o) +def run_find(parser: ArgumentParser, args: Namespace) -> None: + """Run the find command.""" + validate_find_args(parser, args) -def run_find(spec_fn: str, modality: str, id: str, idtype: str, o: str): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) found = [] - if idtype == "region-type": - found = find_by_region_type(spec, modality, id) - elif idtype == "region": - found = find_by_region_id(spec, modality, id) - elif idtype == "read": - found = find_by_read_id(spec, modality, id) - elif idtype == "file": - found = find_by_file_id(spec, modality, id) + + if args.selector == "region-type": + found = find_by_region_type(spec, args.modality, args.id) + elif args.selector == "region": + found = find_by_region_id(spec, args.modality, args.id) + elif args.selector == "read": + found = find_by_read_id(spec, args.modality, args.id) + elif args.selector == "file": + found = find_by_file_id(spec, args.modality, args.id) else: - raise ValueError(f"Unknown idtype: {idtype}") + raise ValueError(f"Unknown selector: {args.selector}") # post processing - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: yaml.dump(found, f, sort_keys=False) else: print(yaml.dump(found, sort_keys=False)) - return +def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: + """Find reads by their ID. -# TODO implement -def find_by_read_id(spec: Assay, modality: str, id: str): + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The read ID to search for. + + Returns: + A list of Read objects matching the ID. + """ rds = [] reads = spec.get_seqspec(modality) for r in reads: @@ -127,10 +149,19 @@ def find_by_read_id(spec: Assay, modality: str, id: str): return rds -# TODO implement -def find_by_file_id(spec: Assay, modality: str, id: str): +def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: + """Find files by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The file ID to search for. + + Returns: + A list of File objects matching the ID. + """ files = [] - lf = list_files(spec, modality) + lf = list_all_files(spec, modality) for k, v in lf.items(): for f in v: if f.file_id == id: @@ -138,13 +169,33 @@ def find_by_file_id(spec: Assay, modality: str, id: str): return files -def find_by_region_id(spec: Assay, modality: str, id: str): +def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: + """Find regions by their ID. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region ID to search for. + + Returns: + A list of Region objects matching the ID. + """ m = spec.get_libspec(modality) regions = m.get_region_by_id(id) return regions -def find_by_region_type(spec: Assay, modality: str, id: str): +def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: + """Find regions by their type. + + Args: + spec: The seqspec specification. + modality: The modality to search in. + id: The region type to search for. + + Returns: + A list of Region objects matching the type. 
+ """ m = spec.get_libspec(modality) regions = m.get_region_by_region_type(id) return regions diff --git a/seqspec/seqspec_format.py b/seqspec/seqspec_format.py index 87eccad..b6d209f 100644 --- a/seqspec/seqspec_format.py +++ b/seqspec/seqspec_format.py @@ -1,8 +1,25 @@ +"""Format module for seqspec CLI. + +This module provides functionality to automatically format and fill in missing fields +in a seqspec specification file. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay -def setup_format_args(parser): +def setup_format_args(parser) -> ArgumentParser: + """Create and configure the format command subparser. + + Args: + parser: The main argument parser to add the format subparser to. + + Returns: + The configured format subparser. + """ subparser = parser.add_parser( "format", description=""" @@ -16,34 +33,58 @@ def setup_format_args(parser): help="Autoformat seqspec file", formatter_class=RawTextHelpFormatter, ) - # subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) return subparser -def validate_format_args(parser, args): - fn = args.yaml - o = args.o +def validate_format_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the format command arguments. + + Args: + parser: The argument parser. + args: The parsed arguments. - run_format(spec_fn=fn, o=o) + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_format(spec_fn, o): - spec = load_spec(spec_fn) - format(spec) - if o: - spec.to_YAML(o) + +def run_format(parser: ArgumentParser, args: Namespace) -> None: + """Run the format command. + + Args: + parser: The argument parser. + args: The parsed arguments. + """ + validate_format_args(parser, args) + + spec = load_spec(args.yaml) + format_spec(spec) + + if args.output: + spec.to_YAML(args.output) else: print(spec.to_YAML()) -def format(spec): - return spec.update_spec() +def format_spec(spec: Assay) -> None: + """Format a seqspec specification by updating its fields. + + Args: + spec: The seqspec specification to format. + """ + spec.update_spec() diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index 4bbc657..c08576b 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -1,19 +1,28 @@ +"""Index module for seqspec CLI. + +This module provides functionality to identify the position of elements in a spec for use in downstream tools. 
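+
+Example usage (illustrative; read IDs are hypothetical):
+
+    seqspec index -m rna -t kb -s read -i rna_R1,rna_R2 spec.yaml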
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings +from typing import List, Optional + from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import find_by_region_id -import warnings from seqspec.seqspec_file import list_files_by_file_id, list_read_files -from argparse import SUPPRESS, RawTextHelpFormatter -from seqspec.Region import complement_sequence -from seqspec.Region import RegionCoordinateDifference - from seqspec.Region import ( + complement_sequence, + RegionCoordinateDifference, project_regions_to_coordinates, itx_read, ) from seqspec.Read import ReadCoordinate +from seqspec.Assay import Assay -def setup_index_args(parser): +def setup_index_args(parser) -> ArgumentParser: + """Create and configure the index command subparser.""" subparser = parser.add_parser( "index", description=""" @@ -30,12 +39,13 @@ def setup_index_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) @@ -50,6 +60,7 @@ def setup_index_args(parser): choices = [ "chromap", "kb", + "kb-single", "relative", "seqkit", "simpleaf", @@ -60,8 +71,9 @@ def setup_index_args(parser): ] subparser.add_argument( "-t", + "--tool", metavar="TOOL", - help=(f"Tool, [{', '.join(choices)}] (default: tab)"), + help=f"Tool, [{', '.join(choices)}] (default: tab)", default="tab", type=str, choices=choices, @@ -70,8 +82,9 @@ def setup_index_args(parser): choices = ["read", "region", "file"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -90,16 +103,17 @@ def setup_index_args(parser): subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser_required.add_argument( "-i", + "--ids", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, required=False, @@ -114,7 +128,8 @@ def setup_index_args(parser): return subparser -def validate_index_args(parser, args): +def validate_index_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the index command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -122,8 +137,8 @@ def validate_index_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r + if not args.ids: + args.ids = args.r if args.region: warnings.warn( "The '--region' argument is deprecated and will be removed in a future version. 
" @@ -131,61 +146,67 @@ def validate_index_args(parser, args): DeprecationWarning, ) - fn = args.yaml - m = args.m - ids = args.i # this can be a list of ids (reads, regions, or files) - t = args.t - o = args.o - subregion_type = args.subregion_type - rev = args.rev - idtype = args.s + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - if ids is None and (idtype == "read" or idtype == "region"): + if args.ids is None and (args.selector == "read" or args.selector == "region"): parser.error("Must specify ids with -i for -s read or -s region") - return run_index(fn, m, ids, idtype, t, rev, subregion_type, o=o) - - -def run_index( - spec_fn, - modality, - ids, - idtype, - fmt, - rev, - subregion_type, - o, -): - spec = load_spec(spec_fn) - if ids is None: - ids = [] - else: - ids = ids.split(",") - x = index(spec, modality, ids, idtype, fmt, rev, subregion_type) +def run_index(parser: ArgumentParser, args: Namespace) -> None: + """Run the index command.""" + validate_index_args(parser, args) - # post processing - if o: - with open(o, "w") as f: - print(x, file=f) - else: - print(x) + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] - return + result = index( + spec, + args.modality, + ids, + args.selector, + args.tool, + args.rev, + args.subregion_type, + ) + + if args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) def index( - spec, - modality, - ids, - idtype, - fmt, - rev=False, - subregion_type=None, -): + spec: Assay, + modality: str, + ids: List[str], + idtype: str, + fmt: str, + rev: bool = False, + subregion_type: Optional[str] = None, +) -> str: + """Get index information from the spec. + + Args: + spec: The seqspec specification. + modality: The modality to index. + ids: List of IDs to index. + idtype: Type of ID (read, region, file). + fmt: Output format. + rev: Whether to return 3'->5' region order. + subregion_type: Optional subregion type. + + Returns: + Formatted index information. 
+ """ FORMAT = { "chromap": format_chromap, "kb": format_kallisto_bus, + "kb-single": format_kallisto_bus_force_single, "relative": format_relative, "seqkit": format_seqkit_subseq, "simpleaf": format_simpleaf, @@ -205,7 +226,7 @@ def index( "read": get_index_by_read_ids, } - if len(ids) == 0: + if not ids: indices = GET_INDICES[idtype](spec, modality) else: indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids) @@ -311,6 +332,43 @@ def format_kallisto_bus(indices, subregion_type=None): return x +def format_kallisto_bus_force_single(indices, subregion_type=None): + bcs = [] + umi = [] + feature = [] + longest_feature = None + max_length = 0 + + for idx, region in enumerate(indices): + rg_strand = region.pop("strand") # noqa + for rgn, cuts in region.items(): + for cut in cuts: + if cut.region_type.upper() == "BARCODE": + bcs.append(f"{idx},{cut.start},{cut.stop}") + elif cut.region_type.upper() == "UMI": + umi.append(f"{idx},{cut.start},{cut.stop}") + elif ( + cut.region_type.upper() == "CDNA" + or cut.region_type.upper() == "GDNA" + or cut.region_type.upper() == "PROTEIN" + or cut.region_type.upper() == "TAG" + ): + length = cut.stop - cut.start + if length > max_length: + max_length = length + longest_feature = f"{idx},{cut.start},{cut.stop}" + + if len(umi) == 0: + umi.append("-1,-1,-1") + if len(bcs) == 0: + bcs.append("-1,-1,-1") + if longest_feature: + feature.append(longest_feature) + + x = ",".join(bcs) + ":" + ",".join(umi) + ":" + ",".join(feature) + return x + + # this one should only return one string # TODO: return to this def format_seqkit_subseq(indices, subregion_type=None): diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index c01a963..b9573f7 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -4,10 +4,12 @@ from seqspec.Region import Region from seqspec.Read import Read from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter +from argparse import RawTextHelpFormatter, ArgumentParser, Namespace +from pathlib import Path -def setup_info_args(parser): +def setup_info_args(parser) -> ArgumentParser: + """Create and configure the info command subparser.""" subparser = parser.add_parser( "info", description=""" @@ -24,12 +26,13 @@ def setup_info_args(parser): formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) choices = ["modalities", "meta", "sequence_spec", "library_spec"] subparser.add_argument( "-k", + "--key", metavar="KEY", - help=(f"Object to display, [{', '.join(choices)}] (default: meta)"), + help=f"Object to display, [{', '.join(choices)}] (default: meta)", type=str, default="meta", required=False, @@ -37,8 +40,9 @@ def setup_info_args(parser): choices = ["tab", "json"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"The output format, [{', '.join(choices)}] (default: tab)"), + help=f"The output format, [{', '.join(choices)}] (default: tab)", type=str, default="tab", required=False, @@ -46,26 +50,30 @@ def setup_info_args(parser): ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, required=False, ) return subparser -def validate_info_args(parser, args): - spec_fn = args.yaml - o = args.o - k = args.k - fmt = args.f - return run_info(spec_fn, fmt, k, o) +def validate_info_args(parser: ArgumentParser, args: Namespace) -> None: + 
"""Validate the info command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_info(spec_fn, f, k=None, o=None): - # return json of the Assay object - spec = load_spec(spec_fn) +def run_info(parser: ArgumentParser, args: Namespace) -> None: + """Run the info command.""" + validate_info_args(parser, args) + + spec = load_spec(args.yaml) CMD = { "modalities": seqspec_info_modalities, "meta": seqspec_info, @@ -73,23 +81,24 @@ def run_info(spec_fn, f, k=None, o=None): "library_spec": seqspec_info_library_spec, } s = "" - if k: - s = CMD[k](spec, f) + if args.key: + s = CMD[args.key](spec, args.format) - if o: - with open(o, "w") as f: + if args.output: + with open(args.output, "w") as f: json.dump(s, f, sort_keys=False, indent=4) else: print(s) - return -def seqspec_info(spec, fmt): +def seqspec_info(spec: Assay, fmt: str) -> str: + """Get meta information about the spec.""" s = format_info(spec, fmt) return s -def seqspec_info_library_spec(spec, fmt): +def seqspec_info_library_spec(spec: Assay, fmt: str) -> str: + """Get library specification information.""" modalities = spec.list_modalities() s = "" for m in modalities: @@ -98,17 +107,20 @@ def seqspec_info_library_spec(spec, fmt): return s -def seqspec_info_sequence_spec(spec: Assay, fmt): +def seqspec_info_sequence_spec(spec: Assay, fmt: str) -> str: + """Get sequence specification information.""" reads = format_sequence_spec(spec.sequence_spec, fmt) return reads -def seqspec_info_modalities(spec, fmt): +def seqspec_info_modalities(spec: Assay, fmt: str) -> str: + """Get list of modalities.""" modalities = format_modalities(spec.list_modalities(), fmt) return modalities -def format_info(spec: Assay, fmt="tab"): +def format_info(spec: Assay, fmt: str = "tab") -> str: + """Format meta information.""" sd = spec.to_dict() del sd["library_spec"] del sd["sequence_spec"] @@ -120,11 +132,11 @@ def format_info(spec: Assay, fmt="tab"): s = s[:-1] elif fmt == "json": s = json.dumps(sd, sort_keys=False, indent=4) - return s -def format_modalities(modalities: List[str], fmt="tab"): +def format_modalities(modalities: List[str], fmt: str = "tab") -> str: + """Format list of modalities.""" s = "" if fmt == "tab": s = "\t".join(modalities) @@ -133,19 +145,24 @@ def format_modalities(modalities: List[str], fmt="tab"): return s -def format_sequence_spec(sequence_spec: List[Read], fmt="tab"): +def format_sequence_spec(sequence_spec: List[Read], fmt: str = "tab") -> str: + """Format sequence specification.""" s = "" if fmt == "tab": # format the output as a table for r in sequence_spec: - s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{','.join([i.file_id for i in r.files])}\n" + files = ",".join([i.file_id for i in r.files]) if r.files else "" + s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}\n" s = s[:-1] elif fmt == "json": s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4) return s -def format_library_spec(modality: str, library_spec: List[Region], fmt="tab"): +def format_library_spec( + modality: str, library_spec: List[Region], fmt: str = "tab" +) -> str: + """Format library specification.""" s = "" if fmt == "tab": for r in library_spec: diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py 
index 066c5b6..361324b 100644 --- a/seqspec/seqspec_init.py +++ b/seqspec/seqspec_init.py @@ -1,18 +1,25 @@ +"""Init module for seqspec CLI. + +This module provides functionality to generate new seqspec files from a newick tree format. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List + +import newick from seqspec.Assay import Assay from seqspec.Region import Region from seqspec.File import File from seqspec.Read import Read -from typing import List -import newick -from argparse import RawTextHelpFormatter # example +# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" # seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" " ((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" +# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -# seqspec init -n myassay -m 1 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna)" -# seqspec init -n myassay -m 1 -o spec.yaml -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" "((truseq_r1:10,barcode:16,umi:12,cdna:150)rna)" -# seqspec init -n myassay -m 2 -o spec.yaml "(((barcode:16,umi:12)r1.fastq.gz,(cdna:150)r2.fastq.gz)rna,((barcode:16)r1.fastq.gz,(gdna:150)r2.fastq.gz,(gdna:150)r3.fastq.gz)atac)" -def setup_init_args(parser): +def setup_init_args(parser) -> ArgumentParser: + """Create and configure the init command subparser.""" subparser = parser.add_parser( "init", description=""" @@ -27,85 +34,92 @@ def setup_init_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") subparser_required.add_argument( - "-n", metavar="NAME", type=str, help="assay name", required=True + "-n", "--name", metavar="NAME", type=str, help="Assay name", required=True ) - # -m "rna,atac" subparser_required.add_argument( "-m", + "--modalities", metavar="MODALITIES", type=str, - help="list of comma-separated modalities (e.g. rna,atac)", + help="List of comma-separated modalities (e.g. rna,atac)", required=True, ) # -r "rna,R1.fastq.gz,truseq_r1,16,pos:rna,R2.fastq.gz,truseq_r2,100,neg" subparser_required.add_argument( "-r", + "--reads", metavar="READS", type=str, - help="list of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", + help="List of modalities, reads, primer_ids, lengths, and strand (e.g. 
modality,fastq_name,primer_id,len,strand:...)", required=True, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) subparser.add_argument( "newick", - help=( - "tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)" - ), + help="Tree in newick format (https://marvin.cs.uidaho.edu/Teaching/CS515/newickFormat.html)", ) return subparser -def validate_init_args(parser, args): - name = args.n - modalities_str = args.m - newick_str = args.newick - o = args.o - reads_str = args.r +def validate_init_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the init command arguments.""" + if not args.newick: + parser.error("Newick tree must be provided") + + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - if newick_str is None: - parser.error("modality-FASTQs pairs must be provided") - return run_init(name, modalities_str, newick_str, reads_str, o) +def run_init(parser: ArgumentParser, args: Namespace) -> None: + """Run the init command.""" + validate_init_args(parser, args) + modalities = args.modalities.split(",") + reads = parse_reads_string(args.reads) + tree = newick.loads(args.newick) -def run_init(name: str, modalities_str, newick_str, reads_str, o=None): - modalities = modalities_str.split(",") - reads = parse_reads_string(reads_str) - tree = newick.loads(newick_str) if len(tree[0].descendants) != len(modalities): raise ValueError( "Number of modalities must match number of modality-FASTQs pairs" ) - reads = parse_reads_string(reads_str) - spec = init(name, modalities, tree[0].descendants, reads) + spec = init(args.name, modalities, tree[0].descendants, reads) - if o: - spec.to_YAML(o) + if args.output: + spec.to_YAML(args.output) else: print(spec.to_YAML()) - return +def init( + name: str, modalities: List[str], tree: List[newick.Node], reads: List[Read] +) -> Assay: + """Initialize a new seqspec specification. + + Args: + name: Name of the assay. + modalities: List of modalities. + tree: Newick tree nodes. + reads: List of read specifications. -def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): - # make read for each fastq - # make region for each modality - # add modality regions to assay - rgns = [] - for t in tree: - r = Region(region_id="", region_type="", name="", sequence_type="") - rgns.append(newick_to_region(t, r)) + Returns: + Initialized Assay object. + """ + regions = [] + for node in tree: + region = Region(region_id="", region_type="", name="", sequence_type="") + regions.append(newick_to_region(node, region)) - assay = Assay( + return Assay( assay_id="", name=name, doi="", @@ -118,59 +132,79 @@ def init(name: str, modalities, tree: List[newick.Node], reads: List[Read]): sequence_kit="", sequence_protocol="", sequence_spec=reads, - library_spec=rgns, + library_spec=regions, ) - return assay -def newick_to_region( - node, region=Region(region_id="", region_type="", name="", sequence_type="") -): +def newick_to_region(node: newick.Node, region: Region) -> Region: + """Convert a newick node to a Region object. + + Args: + node: Newick tree node. + region: Base region object to populate. + + Returns: + Populated Region object. 
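+
+    Example (illustrative): for the newick leaf "barcode:16", the returned
+    region has region_id and name "barcode" with min_len == max_len == 16.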
+ """ region.region_id = node.name region.name = node.name - if len(node.descendants) == 0: + if not node.descendants: region.min_len = int(node.length) region.max_len = int(node.length) return region + region.regions = [] - for n in node.descendants: + for descendant in node.descendants: region.regions.append( newick_to_region( - n, - Region(region_id=n.name, region_type="", name=n.name, sequence_type=""), + descendant, + Region( + region_id=descendant.name, + region_type="", + name=descendant.name, + sequence_type="", + ), ) ) return region -def parse_reads_string(input_string): +def parse_reads_string(input_string: str) -> List[Read]: + """Parse a string of read specifications into Read objects. + + Args: + input_string: String containing read specifications in format + "modality,read_id,primer_id,min_len,strand:..." + + Returns: + List of Read objects. + """ reads = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - modality, read_id, primer_id, min_len, strand = parts - - read = Read( - read_id=read_id, - name=read_id, - modality=modality, - primer_id=primer_id, - min_len=int(min_len), - max_len=int(min_len), - strand=strand, - files=[ - File( - file_id=read_id, - filename=read_id, - filetype="", - filesize=0, - url="", - urltype="", - md5="", - ) - ], + for obj in input_string.split(":"): + modality, read_id, primer_id, min_len, strand = obj.split(",") + + reads.append( + Read( + read_id=read_id, + name=read_id, + modality=modality, + primer_id=primer_id, + min_len=int(min_len), + max_len=int(min_len), + strand=strand, + files=[ + File( + file_id=read_id, + filename=read_id, + filetype="", + filesize=0, + url="", + urltype="", + md5="", + ) + ], + ) ) - reads.append(read) return reads diff --git a/seqspec/seqspec_methods.py b/seqspec/seqspec_methods.py index 2d19915..6f7a3b0 100644 --- a/seqspec/seqspec_methods.py +++ b/seqspec/seqspec_methods.py @@ -1,17 +1,27 @@ +"""Methods module for seqspec. + +This module provides functionality to convert seqspec files into methods sections. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec from seqspec.Assay import Assay from seqspec.Region import Region -from argparse import RawTextHelpFormatter +from seqspec.Read import Read, File -def setup_methods_args(parser): +def setup_methods_args(parser) -> ArgumentParser: + """Create and configure the methods command subparser.""" subparser = parser.add_parser( "methods", description=""" Convert seqspec file into methods section. 
Examples: -seqspec methods -m rna spec.yaml # Return methods section for rna modality +seqspec methods -m rna -o methods.txt spec.yaml # Save methods section to file +seqspec methods -m rna spec.yaml # Print methods section to stdout --- """, help="Convert seqspec file into methods section", @@ -19,46 +29,50 @@ def setup_methods_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, - default=None, required=True, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_methods_args(parser, args): - # if everything is valid the run_methods - fn = args.yaml - o = args.o +def validate_methods_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the methods command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - m = args.m - return run_methods(fn, m, o) + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_methods(spec_fn, m, o): - spec = load_spec(spec_fn) - m = methods(spec, m) +def run_methods(parser: ArgumentParser, args: Namespace) -> None: + """Run the methods command.""" + validate_methods_args(parser, args) - if o: - with open(o, "w") as f: - print(m, file=f) + spec = load_spec(args.yaml) + methods_text = methods(spec, args.modality) + + if args.output: + args.output.write_text(methods_text) else: - print(m) + print(methods_text) -def methods(spec, modality): +def methods(spec: Assay, modality: str) -> str: + """Generate methods section for spec and modality.""" m = f"""Methods The {modality} portion of the {spec.name} assay was generated on {spec.date}. 
""" @@ -67,15 +81,16 @@ def methods(spec, modality): # TODO: manage sequence/library protocol/kit for cases where each modality has different protocols/kits -def format_library_spec(spec: Assay, m): - leaves = spec.get_libspec(m).get_leaves() +def format_library_spec(spec: Assay, modality: str) -> str: + """Format library specification for methods section.""" + leaves = spec.get_libspec(modality).get_leaves() lib_prot = None if isinstance(spec.library_protocol, str): lib_prot = spec.library_protocol elif isinstance(spec.library_protocol, list): for i in spec.library_protocol: - if i.modality == m: + if i.modality == modality: lib_prot = i.protocol_id lib_kit = None @@ -83,7 +98,7 @@ def format_library_spec(spec: Assay, m): lib_kit = spec.library_kit elif isinstance(spec.library_kit, list): for i in spec.library_kit: - if i.modality == m: + if i.modality == modality: lib_kit = i.kit_id seq_prot = None @@ -91,7 +106,7 @@ def format_library_spec(spec: Assay, m): seq_prot = spec.sequence_protocol elif isinstance(spec.sequence_protocol, list): for i in spec.sequence_protocol: - if i.modality == m: + if i.modality == modality: seq_prot = i.protocol_id seq_kit = None @@ -99,7 +114,7 @@ def format_library_spec(spec: Assay, m): seq_kit = spec.sequence_kit elif isinstance(spec.sequence_kit, list): for i in spec.sequence_kit: - if i.modality == m: + if i.modality == modality: seq_kit = i.kit_id s = f""" @@ -112,13 +127,14 @@ def format_library_spec(spec: Assay, m): \nSequence structure\n The library was sequenced on a {seq_prot} using the {seq_kit} sequencing kit. The library was sequenced using the following configuration:\n """ - reads = spec.get_seqspec(m) + reads = spec.get_seqspec(modality) for idx, r in enumerate(reads, 1): s += format_read(r, idx) return s -def format_region(region: Region, idx: int = 1): +def format_region(region: Region, idx: int = 1) -> str: + """Format region for methods section.""" s = f"{idx}. {region.name}: {region.min_len}-{region.max_len}bp {region.sequence_type} sequence ({region.sequence})" if region.onlist: s += f", onlist file: {region.onlist.filename}.\n" @@ -127,14 +143,15 @@ def format_region(region: Region, idx: int = 1): return s -def format_read(read, idx: int = 1): +def format_read(read: Read, idx: int = 1) -> str: + """Format read for methods section.""" s = f"- {read.name}: {read.max_len} cycles on the {'positive' if read.strand == 'pos' else 'negative'} strand using the {read.primer_id} primer. The following files contain the sequences in Read {idx}:\n" - for idx, f in enumerate(read.files, 1): - s += " " + format_read_file(f, idx) - s = s[:-1] + if read.files: + for idx, f in enumerate(read.files, 1): + s += " " + format_read_file(f, idx) return s -def format_read_file(file, idx: int = 1): - s = f"- File {idx}: {file.filename}\n" - return s +def format_read_file(file: File, idx: int = 1) -> str: + """Format read file for methods section.""" + return f"- File {idx}: {file.filename}\n" diff --git a/seqspec/seqspec_modify.py b/seqspec/seqspec_modify.py index 006cf00..78aa1ec 100644 --- a/seqspec/seqspec_modify.py +++ b/seqspec/seqspec_modify.py @@ -1,12 +1,20 @@ +"""Modify module for seqspec. + +This module provides functionality to modify attributes of various elements in seqspec files. 
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +from typing import List, Optional +import warnings + from seqspec.utils import load_spec from seqspec.File import File -from argparse import RawTextHelpFormatter, SUPPRESS -import warnings +from seqspec.Assay import Assay -# TODO fix modify to use the -s selector -def setup_modify_args(parser): - # given a spec, a region id and a list of key value property pairs, modify the spec +def setup_modify_args(parser) -> ArgumentParser: + """Create and configure the modify command subparser.""" subparser = parser.add_parser( "modify", description=""" @@ -22,34 +30,34 @@ def setup_modify_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) # Read properties subparser.add_argument( "--read-id", metavar="READID", - help=("New ID of read"), + help="New ID of read", type=str, default=None, ) subparser.add_argument( "--read-name", metavar="READNAME", - help=("New name of read"), + help="New name of read", type=str, default=None, ) subparser.add_argument( "--primer-id", metavar="PRIMERID", - help=("New ID of primer"), + help="New ID of primer", type=str, default=None, ) subparser.add_argument( "--strand", metavar="STRAND", - help=("New strand"), + help="New strand", type=str, default=None, ) @@ -59,45 +67,44 @@ def setup_modify_args(parser): subparser.add_argument( "--files", metavar="FILES", - help=("New files, (filename,filetype,filesize,url,urltype,md5:...)"), + help="New files, (filename,filetype,filesize,url,urltype,md5:...)", type=str, default=None, ) # Region properties - subparser.add_argument( "--region-id", metavar="REGIONID", - help=("New ID of region"), + help="New ID of region", type=str, default=None, ) subparser.add_argument( "--region-type", metavar="REGIONTYPE", - help=("New type of region"), + help="New type of region", type=str, default=None, ) subparser.add_argument( "--region-name", metavar="REGIONNAME", - help=("New name of region"), + help="New name of region", type=str, default=None, ) subparser.add_argument( "--sequence-type", metavar="SEQUENCETYPE", - help=("New type of sequence"), + help="New type of sequence", type=str, default=None, ) subparser.add_argument( "--sequence", metavar="SEQUENCE", - help=("New sequence"), + help="New sequence", type=str, default=None, ) @@ -106,25 +113,25 @@ def setup_modify_args(parser): subparser.add_argument( "--min-len", metavar="MINLEN", - help=("Min region length"), + help="Min region length", type=int, default=None, ) subparser.add_argument( "--max-len", metavar="MAXLEN", - help=("Max region length"), + help="Max region length", type=int, default=None, ) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, - required=False, ) subparser_required.add_argument( "-r", @@ -132,38 +139,38 @@ def setup_modify_args(parser): help=SUPPRESS, type=str, default=None, - required=False, ) subparser_required.add_argument( "-i", metavar="IDs", - help=("IDs"), + help="IDs", type=str, default=None, - required=False, ) choices = ["read", "region"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', 
'.join(choices)}] (default: read)", type=str, default="read", choices=choices, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality of the assay"), + help="Modality of the assay", type=str, - default=None, required=True, ) return subparser -def validate_modify_args(parser, args): +def validate_modify_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the modify command arguments.""" if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. " @@ -174,103 +181,99 @@ def validate_modify_args(parser, args): if not args.i: args.i = args.r - # if everything is valid the run_format - fn = args.yaml - o = args.o - modality = args.m - # target_r = args.r - idtype = args.s # selector - ids = args.i + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - # Read properties - read_id = args.read_id - read_name = args.read_name - primer_id = args.primer_id - strand = args.strand - files = args.files + if args.output and args.output.exists() and not args.output.is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - # Region properties - region_id = args.region_id - region_type = args.region_type - region_name = args.region_name - sequence_type = args.sequence_type - sequence = args.sequence - # Read and Region properties - min_len = args.min_len - max_len = args.max_len +def run_modify(parser: ArgumentParser, args: Namespace) -> None: + """Run the modify command.""" + validate_modify_args(parser, args) - spec = load_spec(fn) + spec = load_spec(args.yaml) + # Read properties read_kwd = { - "read_id": read_id, - "read_name": read_name, - "primer_id": primer_id, - "min_len": min_len, - "max_len": max_len, - "strand": strand, - "files": files, + "read_id": args.read_id, + "read_name": args.read_name, + "primer_id": args.primer_id, + "min_len": args.min_len, + "max_len": args.max_len, + "strand": args.strand, + "files": args.files, } + # Region properties region_kwd = { - "region_id": region_id, - "region_type": region_type, - "name": region_name, - "sequence_type": sequence_type, - "sequence": sequence, - "min_len": min_len, - "max_len": max_len, + "region_id": args.region_id, + "region_type": args.region_type, + "name": args.region_name, + "sequence_type": args.sequence_type, + "sequence": args.sequence, + "min_len": args.min_len, + "max_len": args.max_len, } - if idtype == "region": - spec = run_modify_region(spec, modality, ids, **region_kwd) - elif idtype == "read": - spec = run_modify_read(spec, modality, ids, **read_kwd) - # update region in spec - # once the region is updated, update the spec + if args.selector == "region": + spec = run_modify_region(spec, args.modality, args.i, **region_kwd) + elif args.selector == "read": + spec = run_modify_read(spec, args.modality, args.i, **read_kwd) + + # Update spec spec.update_spec() - if o: - spec.to_YAML(o) + + if args.output: + args.output.write_text(spec.to_YAML()) else: print(spec.to_YAML()) def run_modify_read( - spec, - modality, - target_read, - read_id, - read_name, - primer_id, - min_len, - max_len, - strand, - files, -): + spec: Assay, + modality: str, + target_read: str, + read_id: Optional[str] = None, + read_name: Optional[str] = None, + primer_id: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, + strand: Optional[str] = None, + files: Optional[str] = None, +) -> Assay: + """Modify read properties in spec.""" reads = 
spec.get_seqspec(modality) if files: - files = parse_files_string(files) + files_list = parse_files_string(files) for r in reads: if r.read_id == target_read: r.update_read_by_id( - read_id, read_name, modality, primer_id, min_len, max_len, strand, files + read_id, + read_name, + modality, + primer_id, + min_len, + max_len, + strand, + files_list, ) - return spec def run_modify_region( - spec, - modality, - target_region, - region_id, - region_type, - name, - sequence_type, - sequence, - min_len, - max_len, -): + spec: Assay, + modality: str, + target_region: str, + region_id: Optional[str] = None, + region_type: Optional[str] = None, + name: Optional[str] = None, + sequence_type: Optional[str] = None, + sequence: Optional[str] = None, + min_len: Optional[int] = None, + max_len: Optional[int] = None, +) -> Assay: + """Modify region properties in spec.""" spec.get_libspec(modality).update_region_by_id( target_region, region_id, @@ -281,27 +284,23 @@ def run_modify_region( min_len, max_len, ) - return spec -# filename,filetype,filesize,url,urltype,md5:... -def parse_files_string(input_string): +def parse_files_string(input_string: str) -> List[File]: + """Parse files string into list of File objects. # filename,filetype,filesize,url,urltype,md5:...""" files = [] - objects = input_string.split(":") - for obj in objects: - parts = obj.split(",") - filename, filetype, filesize, url, urltype, md5 = parts - - file = File( - file_id=filename, - filename=filename, - filetype=filetype, - filesize=int(filesize), - url=url, - urltype=urltype, - md5=md5, + for f in input_string.split(":"): + filename, filetype, filesize, url, urltype, md5 = f.split(",") + files.append( + File( + file_id=filename, + filename=filename, + filetype=filetype, + filesize=int(filesize), + url=url, + urltype=urltype, + md5=md5, + ) ) - files.append(file) - return files diff --git a/seqspec/seqspec_onlist.py b/seqspec/seqspec_onlist.py index b021a5a..51198b9 100644 --- a/seqspec/seqspec_onlist.py +++ b/seqspec/seqspec_onlist.py @@ -1,16 +1,28 @@ -from seqspec.Assay import Assay -from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist -from seqspec.utils import load_spec, map_read_id_to_regions -from seqspec.seqspec_find import find_by_region_type, find_by_region_id +"""Onlist module for seqspec CLI. + +This module provides functionality to generate and manage onlist files for seqspec regions. 
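+
+Example usage (illustrative; assumes barcode regions with onlist files):
+
+    seqspec onlist -m rna -s region-type -i barcode spec.yaml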
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS +import warnings import os -from seqspec.utils import read_local_list, read_remote_list import itertools from typing import List -from argparse import SUPPRESS, RawTextHelpFormatter -import warnings +from seqspec.Assay import Assay +from seqspec.Region import project_regions_to_coordinates, itx_read, Onlist +from seqspec.utils import ( + load_spec, + map_read_id_to_regions, + read_local_list, + read_remote_list, +) +from seqspec.seqspec_find import find_by_region_type, find_by_region_id -def setup_onlist_args(parser): + +def setup_onlist_args(parser) -> ArgumentParser: + """Create and configure the onlist command subparser.""" subparser = parser.add_parser( "onlist", description=""" @@ -25,20 +37,22 @@ def setup_onlist_args(parser): formatter_class=RawTextHelpFormatter, ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) choices = ["read", "region", "region-type"] subparser.add_argument( "-s", + "--selector", metavar="SELECTOR", - help=(f"Selector for ID, [{', '.join(choices)}] (default: read)"), + help=f"Selector for ID, [{', '.join(choices)}] (default: read)", type=str, default="read", choices=choices, @@ -55,6 +69,7 @@ def setup_onlist_args(parser): format_choices = ["product", "multi"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", type=str, default="product", @@ -63,16 +78,18 @@ def setup_onlist_args(parser): ) subparser_required.add_argument( "-i", - metavar="IDs", - help=("IDs"), + "--id", + metavar="ID", + help="ID to search for", type=str, default=None, required=False, ) subparser_required.add_argument( "-m", + "--modality", metavar="MODALITY", - help=("Modality"), + help="Modality", type=str, default=None, required=True, @@ -81,7 +98,14 @@ def setup_onlist_args(parser): return subparser -def validate_onlist_args(parser, args): +def validate_onlist_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the onlist command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + if args.r is not None: warnings.warn( "The '-r' argument is deprecated and will be removed in a future version. 
" @@ -89,32 +113,26 @@ def validate_onlist_args(parser, args): DeprecationWarning, ) # Optionally map the old option to the new one - if not args.i: - args.i = args.r - - fn = args.yaml - m = args.m - ids = args.i - fmt = args.f - o = args.o - idtype = args.s + if not args.id: + args.id = args.r - return run_onlist(fn, m, ids, idtype, fmt, o) +def run_onlist(parser: ArgumentParser, args: Namespace) -> None: + """Run the onlist command.""" + validate_onlist_args(parser, args) -def run_onlist(spec_fn, modality, ids, idtype, fmt, o): # the base path is the path to the spec file - base_path = os.path.dirname(os.path.abspath(spec_fn)) + base_path = args.yaml.parent.absolute() # set the save path if it exists - if o: - save_path = os.path.abspath(o) + if args.output: + save_path = args.output else: # otherwise the save path is the same path as the spec - save_path = os.path.join(base_path, "onlist_joined.txt") + save_path = base_path / "onlist_joined.txt" # load spec - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) # if number of barcodes > 1 then we need to join them CMD = { @@ -123,18 +141,20 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): "read": run_onlist_read, } - onlists = CMD[idtype](spec, modality, ids) + onlists = CMD[args.selector](spec, args.modality, args.id) if len(onlists) == 0: - raise ValueError(f"No onlist found for {modality}, {idtype}, {ids}") + raise ValueError( + f"No onlist found for {args.modality}, {args.selector}, {args.id}" + ) # for only one onlist we can just return the path # if only one, its remote and we save it to the base path elif len(onlists) == 1: urltype = onlists[0].urltype - onlist_fn = os.path.basename(onlists[0].filename) - onlist_path = os.path.join(base_path, onlist_fn) - if os.path.exists(onlist_path): + onlist_fn = Path(onlists[0].filename).name + onlist_path = base_path / onlist_fn + if onlist_path.exists(): urltype = "local" elif urltype in ["http", "https"]: # download the onlist to the base path and return the path @@ -150,12 +170,11 @@ def run_onlist(spec_fn, modality, ids, idtype, fmt, o): elif o.urltype in ["http", "https"]: # base_path is ignored for remote onlists lsts.append(read_remote_list(o, base_path)) - onlist_elements = join_onlists(lsts, fmt) + onlist_elements = join_onlists(lsts, args.format) onlist_path = write_onlist(onlist_elements, save_path) # print the path to the onlist print(onlist_path) - return def run_onlist_region_type( @@ -174,7 +193,9 @@ def run_onlist_region(spec: Assay, modality: str, region_id: str) -> List[Onlist regions = find_by_region_id(spec, modality, region_id) onlists: List[Onlist] = [] for r in regions: - onlists.append(r.get_onlist()) + ol = r.get_onlist() + if ol: + onlists.append(ol) if len(onlists) == 0: raise ValueError(f"No onlist found for region {region_id}") return onlists diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 11322f6..1b7744a 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -1,13 +1,33 @@ -from seqspec.utils import load_spec -from seqspec.seqspec_print_html import print_seqspec_html +"""Print module for seqspec CLI. + +This module provides functionality to print sequence and/or library structure +in various formats (ascii, png, html). 
+""" + +from typing import List, Any +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace import newick -from seqspec.utils import REGION_TYPE_COLORS +import matplotlib.pyplot as plt +from matplotlib.patches import Rectangle +import matplotlib.patches as mpatches + +from seqspec.utils import load_spec, REGION_TYPE_COLORS +from seqspec.seqspec_print_html import print_seqspec_html from seqspec.Region import complement_sequence -from seqspec.Region import project_regions_to_coordinates -from argparse import RawTextHelpFormatter +from seqspec.Assay import Assay +from seqspec.seqspec_print_utils import libseq + +def setup_print_args(parser) -> ArgumentParser: + """Create and configure the print command subparser. -def setup_print_args(parser): + Args: + parser: The main argument parser to add the print subparser to. + + Returns: + The configured print subparser. + """ subparser = parser.add_parser( "print", description=""" @@ -23,20 +43,23 @@ def setup_print_args(parser): help="Display the sequence and/or library structure from seqspec file", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + + subparser.add_argument("yaml", type=Path, help="Sequencing specification yaml file") subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + type=Path, + help="Path to output file", default=None, ) format_choices = ["library-ascii", "seqspec-html", "seqspec-png", "seqspec-ascii"] subparser.add_argument( "-f", + "--format", metavar="FORMAT", - help=(f"Format ({', '.join(format_choices)}), default: library-ascii"), + help=f"Format ({', '.join(format_choices)}), default: library-ascii", type=str, default="library-ascii", choices=format_choices, @@ -45,53 +68,87 @@ def setup_print_args(parser): return subparser -def validate_print_args(parser, args): - fmt = args.f +def validate_print_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the print command arguments. + + Args: + parser: The argument parser. + args: The parsed arguments. + + Raises: + parser.error: If any validation fails. + """ + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") - fn = args.yaml - o = args.o - if fmt == "seqspec-png" and o is None: + if args.format == "seqspec-png" and args.output is None: parser.error("Output file required for png format") - return run_seqspec_print(fn, fmt, o) +def run_print(parser: ArgumentParser, args: Namespace) -> None: + """Run the print command. + + Args: + parser: The argument parser. + args: The parsed arguments. 
+ """ + + validate_print_args(parser, args) -def run_seqspec_print(spec_fn, fmt, o): - spec = load_spec(spec_fn) + spec = load_spec(args.yaml) - # TODO: add reads to seqspec html - # TODO: add reads to seqspec png - CMD = { + # Map format to print function + format_to_function = { "library-ascii": print_library_ascii, "seqspec-html": print_seqspec_html, "seqspec-png": print_seqspec_png, "seqspec-ascii": print_seqspec_ascii, } - s = CMD[fmt](spec) - - if fmt == "png": - return s.savefig(o, dpi=300, bbox_inches="tight") # + result = format_to_function[args.format](spec) - if o: - with open(o, "w") as f: - print(s, file=f) + if args.format == "seqspec-png": + result.savefig(args.output, dpi=300, bbox_inches="tight") + elif args.output: + with open(args.output, "w") as f: + print(result, file=f) else: - print(s) - return + print(result) + + +def print_seqspec_ascii(spec: Assay) -> str: + """Print sequence specification in ASCII format. + Args: + spec: The seqspec specification to print. -def print_seqspec_ascii(spec): - p = [] + Returns: + The ASCII formatted string. + """ + parts = [] for modality in spec.modalities: - p.append(format_libseq(spec, modality, *libseq(spec, modality))) - return "\n".join(p) + parts.append(format_libseq(spec, modality, *libseq(spec, modality))) + return "\n".join(parts) + +def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str: + """Format library sequence for a specific modality. -def format_libseq(spec, modality, p, n): + Args: + spec: The seqspec specification. + modality: The modality to format. + p: Positive strand parts. + n: Negative strand parts. + + Returns: + The formatted string. + """ libspec = spec.get_libspec(modality) - s = "\n".join( + return "\n".join( [ modality, "---", @@ -101,154 +158,123 @@ def format_libseq(spec, modality, p, n): "\n".join(n), ] ) - return s -def libseq(spec, modality): - libspec = spec.get_libspec(modality) - seqspec = spec.get_seqspec(modality) - - p = [] - n = [] - leaves = libspec.get_leaves() - cuts = project_regions_to_coordinates(leaves) - for idx, read in enumerate(seqspec, 1): - read_len = read.max_len - read_id = read.read_id - primer_id = read.primer_id - primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0] - primer_pos = cuts[primer_idx] - if read.strand == "pos": - wsl = primer_pos.stop - 1 - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - p.append(f"{ws}|{arrow}>({idx}) {read_id}") - elif read.strand == "neg": - wsl = primer_pos.start - read_len - ws = wsl * " " - - arrowl = read_len - 1 - arrow = arrowl * "-" - - n.append(f"{ws}<{arrow}|({idx}) {read_id}") - return (p, n) - - -def run_print(data): - header = headerTemplate(data.name, data.doi, data.description, data.modalities) - header2 = "## Final Library" - library_spec = multiModalTemplate(data.library_spec) - s = f"{header}\n{header2}\n{library_spec}" - return s - - -def run_print_sequence_spec(spec): - p = [] - for r in spec.sequence_spec: - p.append( - "\t".join( - [r.read_id, r.primer_id, r.strand, str(r.min_len), str(r.max_len)] - ) - ) - return "\n".join(p) +def print_library_ascii(spec: Assay) -> str: + """Print library structure in ASCII format. + Args: + spec: The seqspec specification to print. -def print_library_ascii(spec): - t = [] + Returns: + The ASCII formatted string. 
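+
+    Example (illustrative; the path is a placeholder):
+        >>> spec = load_spec("spec.yaml")
+        >>> print(print_library_ascii(spec))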
+ """ + trees = [] for r in spec.library_spec: - t.append(r.to_newick()) - n = ",".join(t) - # print(n) - tree = newick.loads(f"({n})") + trees.append(r.to_newick()) + tree_str = ",".join(trees) + tree = newick.loads(f"({tree_str})") return tree[0].ascii_art() -def argsort(arr): - # http://stackoverflow.com/questions/3382352/equivalent-of-numpy-argsort-in-basic-python/3382369#3382369 - # by unutbu - return sorted(range(len(arr)), key=arr.__getitem__) - +def print_seqspec_png(spec: Assay): + """Print sequence specification as PNG. -def print_seqspec_png(spec): - # builds directly off of https://colab.research.google.com/drive/1ZCIGrwLEIfE0yo33bP8uscUNPEn1p1DH developed by https://github.com/LucasSilvaFerreira + Args: + spec: The seqspec specification to print. - # modality + Returns: + The matplotlib figure. + """ modalities = spec.list_modalities() modes = [spec.get_libspec(m) for m in modalities] lengths = [i.min_len for i in modes] nmodes = len(modalities) - # sort the modalities by their lengths + # Sort modalities by length asort = argsort(lengths) modalities = [modalities[i] for i in asort] lengths = [lengths[i] for i in asort] modes = [modes[i] for i in asort] - assay_id = spec.assay_id - fig, _ = plot_png(assay_id, modalities, modes, nmodes, lengths) - return fig + return plot_png(spec.assay_id, modalities, modes, nmodes, lengths) + +def argsort(arr: List[Any]) -> List[int]: + """Get indices that would sort an array. -def plot_png(assay, modalities, modes, nmodes, lengths): - import matplotlib.pyplot as plt - from matplotlib.patches import Rectangle - import matplotlib.patches as mpatches + Args: + arr: The array to sort. + Returns: + List of indices that would sort the array. + """ + return sorted(range(len(arr)), key=arr.__getitem__) + + +def plot_png( + assay: str, modalities: List[str], modes: List[Any], nmodes: int, lengths: List[int] +): + """Create PNG plot of sequence specification. + + Args: + assay: The assay ID. + modalities: List of modalities. + modes: List of mode specifications. + nmodes: Number of modes. + lengths: List of lengths. + + Returns: + The matplotlib figure. 
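+
+    Typically reached via print_seqspec_png(spec); saving the result looks
+    roughly like this (illustrative, output path is a placeholder):
+        >>> fig = print_seqspec_png(spec)
+        >>> fig.savefig("spec.png", dpi=300, bbox_inches="tight")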
+ """ fsize = 15 plt.rcParams.update({"font.size": fsize}) - fig, ax = plt.subplots( - figsize=(10, 1 * nmodes), nrows=nmodes - ) + fig, _ = plt.subplots(figsize=(10, 1 * nmodes), nrows=nmodes) title_offset = 0.98 if nmodes > 1 else 1.2 fig.suptitle(assay, y=title_offset) + rts = [] for m, ax in zip(modes, fig.get_axes()): - # get leaves + # Get leaves leaves = m.get_leaves() - # setup plotting variables + # Setup plotting variables y = 0 x = 0 height = 1 for idx, node in enumerate(leaves): - # region tupe + # Region type rtype = node.region_type.lower() - # add to the global list so we can make a legend rts.append(rtype) - # get region properties + + # Get region properties length = node.min_len label = f"{length}" - # setup rectangle for the region + # Setup rectangle for the region rectangle = Rectangle( (x, y), length, height, color=REGION_TYPE_COLORS[rtype], ec="black" ) - # write in the length of the region in the rectangle + # Write length in the rectangle ax.text( x + length / 2, y + height / 2, label, horizontalalignment="center", verticalalignment="center", - ) # , rotation=90) - # add the rectangle + ) ax.add_patch(rectangle) - # add length to x for next region + # Add length to x for next region x += length ax.autoscale() - - # since all axes use the same scale, set the xlim to be 0 to the max length ax.set(**{"xlim": (0, max(lengths)), "ylim": (0, 1)}) - # hide the spines + # Hide the spines for spine in ["right", "top", "left", "bottom"]: ax.spines[spine].set_visible(False) # Hide the axis and ticks and labels @@ -256,10 +282,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): ax.set_yticklabels([]) ax.set_yticks([]) - # label the modality on the ylabel + # Label the modality on the ylabel ax.set_ylabel(m.region_type, rotation=0, fontsize=20, ha="right", va="center") - # adjust the xaxis for the last modality to show the length + # Adjust the xaxis for the last modality to show the length ax.xaxis.set_visible(True) ax.spines["bottom"].set_visible(True) ax.minorticks_on() @@ -270,68 +296,10 @@ def plot_png(assay, modalities, modes, nmodes, lengths): } ) - # setup the figure legend + # Setup the figure legend handles = [] for t in sorted(set(rts)): handles.append(mpatches.Patch(color=REGION_TYPE_COLORS[t], label=t)) fig.legend(handles=handles, loc="center", bbox_to_anchor=(1.1, 0.55)) - return (fig, ax) - - -def headerTemplate(name, doi, description, modalities): - s = f"""# {name} -- DOI: [{doi}]({doi}) -- Description: {description} -- Modalities: {", ".join(modalities)} - """ - return s - -def atomicRegionTemplate( - name, region_type, sequence_type, sequence, min_len, max_len, onlist, ns=0 -): - s = f"""
    {name} - -{' '*ns}- region_type: {region_type} -{' '*ns}- sequence_type: {sequence_type} -{' '*ns}- sequence:
    {sequence}
    -{' '*ns}- min_len: {min_len}
-{' '*ns}- max_len: {max_len}
-{' '*ns}- onlist: {onlist}
-{' '*ns}
    """ - return s - - -def regionsTemplate(regions): - s = "\n".join( - [ - f"{idx + 1}. " - + atomicRegionTemplate( - v.name, - v.region_type, - v.sequence_type, - v.sequence, - v.min_len, - v.max_len, - v.onlist, - len(str(idx + 1)) - + 1 - + 1, # length of string rep of number plus 1 for "." plus 1 for space - ) - for idx, v in enumerate(regions) - ] - ) - return s - - -def libStructTemplate(region): - s = f"""###### {region.name} -
    {region.sequence}
    """ - return s - - -def multiModalTemplate(library_spec): - s = "\n".join( - [libStructTemplate(v) + "\n" + regionsTemplate(v.regions) for v in library_spec] - ) - return s + return fig diff --git a/seqspec/seqspec_print_html.py b/seqspec/seqspec_print_html.py index 69b904d..8ec4694 100644 --- a/seqspec/seqspec_print_html.py +++ b/seqspec/seqspec_print_html.py @@ -1,19 +1,23 @@ +"""Print HTML module for seqspec. + +This module provides functionality to generate HTML representations of seqspec files. +It is used by the print command with the 'seqspec-html' format option. +""" + +from typing import List, Optional + from seqspec.Assay import Assay -from seqspec.Region import Region -from seqspec.Read import Read -from seqspec.Read import File +from seqspec.Region import Region, Onlist, complement_sequence +from seqspec.Read import Read, File +from seqspec.seqspec_print_utils import libseq -def print_seqspec_html(spec): - # header = headerTemplate(spec.name, spec.doi, spec.description, spec.modalities) - # header2 = "## Final Library" - # library_spec = multiModalTemplate(spec.library_spec) - # s = f"{header}\n{header2}\n{library_spec}" - s = htmlTemplate(spec) - return s +def print_seqspec_html(spec: Assay) -> str: + """Generate HTML representation of seqspec.""" + return htmlTemplate(spec) -def headerTemplate(name, doi, description, modalities): +def headerTemplate(name: str, doi: str, description: str, modalities: List[str]) -> str: s = f"""

    {name}

    • @@ -30,7 +34,7 @@ def headerTemplate(name, doi, description, modalities): return s -def colorSeq(regions): +def colorSeq(regions: List[Region]) -> str: return "".join( [f"<{r.region_type}>{r.sequence}" for r in regions] ) @@ -38,21 +42,22 @@ def colorSeq(regions): def atomicRegionTemplate( region: Region, - name, - region_type, - sequence_type, - sequence, - min_len, - max_len, - onlist, - regions, -): + name: str, + region_type: str, + sequence_type: str, + sequence: str, + min_len: int, + max_len: int, + onlist: Optional[Onlist], + regions: Optional[List[Region]], +) -> str: seq = ( colorSeq(region.get_leaves()) if regions else f"<{region_type}>{sequence}" ) - onlist = f"{onlist.filename} (md5: {onlist.md5})" if onlist else None + + ol = f"{onlist.filename} (md5: {onlist.md5})" if onlist else None lst = [] if regions: for idx, r in enumerate(regions): @@ -73,7 +78,6 @@ def atomicRegionTemplate( else: subseq = "" - # subseq = "
    • " + "
    • ".join( [ for i in regions if regions else '']) s = f"""
      {name}
        @@ -94,7 +98,7 @@ def atomicRegionTemplate(
      • min_len: {min_len}
      • max_len: {max_len}
-      • onlist: {onlist}
+      • onlist: {ol}
      • regions: {subseq}
      @@ -103,27 +107,28 @@ def atomicRegionTemplate( return s -def regionsTemplate(regions): +def regionsTemplate(regions: List[Region]) -> str: + templates = [ + atomicRegionTemplate( + r, + r.region_id, + r.region_type, + r.sequence_type, + r.sequence, + r.min_len, + r.max_len, + r.onlist, + r.regions, + ) + for idx, r in enumerate(regions) + ] s = f"""
      1. - {'
      2. '.join([atomicRegionTemplate( - r, - r.region_id, - r.region_type, - r.sequence_type, - r.sequence, - r.min_len, - r.max_len, - r.onlist, - r.regions, - ) for idx, r in enumerate(regions)])} + {'
      3. '.join(templates)}
      """ return s -def libStructTemplate(spec, modality): - from seqspec.seqspec_print import libseq - from seqspec.Region import complement_sequence - +def libStructTemplate(spec: Assay, modality: str) -> str: libspec = spec.get_libspec(modality) seqspec = spec.get_seqspec(modality) # noqa p, n = libseq(spec, modality) @@ -147,7 +152,7 @@ def libStructTemplate(spec, modality): return s -def atomicReadTemplate(read: Read): +def atomicReadTemplate(read: Read) -> str: files = "".join(atomicFileTemplate(f) for f in read.files) if read.files else "" s = f""" @@ -171,21 +176,21 @@ def atomicReadTemplate(read: Read): return s -def atomicFileTemplate(file: File): +def atomicFileTemplate(file: File) -> str: s = f"""
    • {file.filename} (md5: {file.md5})
    • """ return s -def readsTemplate(reads): +def readsTemplate(reads: List[Read]) -> str: s = f"""
      1. {'
      2. '.join([atomicReadTemplate(r) for r in reads])}
      """ return s -def multiModalTemplate(spec: Assay): +def multiModalTemplate(spec: Assay) -> str: modes = spec.modalities s = "" for m in modes: @@ -202,7 +207,7 @@ def multiModalTemplate(spec: Assay): return s -def htmlTemplate(spec): +def htmlTemplate(spec: Assay) -> str: s = f""" diff --git a/seqspec/seqspec_print_utils.py b/seqspec/seqspec_print_utils.py new file mode 100644 index 0000000..9aebc17 --- /dev/null +++ b/seqspec/seqspec_print_utils.py @@ -0,0 +1,79 @@ +"""Utility functions for printing seqspec files. + +This module contains shared functionality used by both seqspec_print.py and seqspec_print_html.py. +""" + +from typing import List, Tuple +from seqspec.Assay import Assay +from seqspec.Region import complement_sequence, project_regions_to_coordinates + + +def libseq(spec: Assay, modality: str) -> Tuple[List[str], List[str]]: + """Get library sequence parts for a specific modality. + + Args: + spec: The seqspec specification. + modality: The modality to get parts for. + + Returns: + Tuple of (positive strand parts, negative strand parts). + """ + libspec = spec.get_libspec(modality) + seqspec = spec.get_seqspec(modality) + + p = [] + n = [] + leaves = libspec.get_leaves() + cuts = project_regions_to_coordinates(leaves) + + for idx, read in enumerate(seqspec, 1): + read_len = read.max_len + read_id = read.read_id + primer_id = read.primer_id + primer_idx = [i for i, l in enumerate(leaves) if l.region_id == primer_id][0] + primer_pos = cuts[primer_idx] + + if read.strand == "pos": + wsl = primer_pos.stop - 1 + ws = wsl * " " + + arrowl = read_len - 1 + arrow = arrowl * "-" + + p.append(f"{ws}|{arrow}>({idx}) {read_id}") + elif read.strand == "neg": + wsl = primer_pos.start - read_len + ws = wsl * " " + + arrowl = read_len - 1 + arrow = arrowl * "-" + + n.append(f"{ws}<{arrow}|({idx}) {read_id}") + + return (p, n) + + +def format_libseq(spec: Assay, modality: str, p: List[str], n: List[str]) -> str: + """Format library sequence for a specific modality. + + Args: + spec: The seqspec specification. + modality: The modality to format. + p: Positive strand parts. + n: Negative strand parts. + + Returns: + The formatted string. + """ + libspec = spec.get_libspec(modality) + + return "\n".join( + [ + modality, + "---", + "\n".join(p), + libspec.sequence, + complement_sequence(libspec.sequence), + "\n".join(n), + ] + ) diff --git a/seqspec/seqspec_split.py b/seqspec/seqspec_split.py index 3ad405a..20436a7 100644 --- a/seqspec/seqspec_split.py +++ b/seqspec/seqspec_split.py @@ -1,17 +1,25 @@ -import os +"""Split module for seqspec. + +This module provides functionality to split seqspec files into one file per modality. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List, Dict, Any + from seqspec.utils import load_spec from seqspec.Assay import Assay -from argparse import RawTextHelpFormatter -def setup_split_args(parser): +def setup_split_args(parser) -> ArgumentParser: + """Create and configure the split command subparser.""" subparser = parser.add_parser( "split", description=""" Split seqspec file into one file per modality. 
Examples: -seqspec split -o split spec.yaml # Split spec into modalities +seqspec split -o split spec.yaml # Split spec into modalities --- """, help="Split seqspec file by modality", @@ -19,59 +27,66 @@ def setup_split_args(parser): ) subparser_required = subparser.add_argument_group("required arguments") - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=str) subparser_required.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, - default=None, + help="Path to output files", + type=Path, required=True, ) return subparser -def validate_split_args(parser, args): - # if everything is valid the run_split - fn = args.yaml - o = args.o - return run_split(fn, o) +def validate_split_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the split command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if Path(args.output).exists() and Path(args.output).is_file(): + parser.error(f"Output path exists: {args.output}") -def run_split(spec_fn, o): - spec = load_spec(spec_fn) - specs = split(spec, o) +def run_split(parser: ArgumentParser, args: Namespace) -> None: + """Run the split command.""" + validate_split_args(parser, args) - for spec in specs: - spec["spec"].to_YAML( - os.path.join(os.path.dirname(o), f"{spec['p']}{spec['m']}.yaml") - ) - return + spec = load_spec(args.yaml) + specs = split(spec, args.output) + for spec_info in specs: + output_path = args.output / f"{spec_info['prefix']}{spec_info['modality']}.yaml" + spec_info["spec"].to_YAML(output_path) -def split(spec, o=""): + +def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: + """Split spec into one file per modality.""" specs = [] modalities = spec.list_modalities() - # make a new spec per modality - for m in modalities: + + # Make a new spec per modality + for modality in modalities: info = { "assay_id": spec.assay_id, "name": spec.name, "doi": spec.doi, "date": spec.date, "description": spec.description, - "modalities": [m], + "modalities": [modality], "lib_struct": spec.lib_struct, "library_kit": spec.library_kit, "library_protocol": spec.library_protocol, "sequence_kit": spec.sequence_kit, "sequence_protocol": spec.sequence_protocol, - "sequence_spec": spec.get_seqspec(m), - "library_spec": [spec.get_libspec(m)], + "sequence_spec": spec.get_seqspec(modality), + "library_spec": [spec.get_libspec(modality)], "seqspec_version": spec.seqspec_version, } spec_m = Assay(**info) spec_m.update_spec() - base_o = "spec." if os.path.basename(o) == "" else f"{os.path.basename(o)}." - specs.append({"p": base_o, "spec": spec_m, "m": m}) + + prefix = "spec." if output_dir.name == "" else f"{output_dir.name}." + specs.append({"prefix": prefix, "spec": spec_m, "modality": modality}) + return specs diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 032adbb..62ec210 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -1,75 +1,89 @@ +"""Upgrade module for seqspec. + +This module provides functionality to upgrade seqspec files from older versions to the current version. 
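+
+Typical CLI usage (illustrative; file names are placeholders):
+    seqspec upgrade spec.yaml                   # print upgraded spec to stdout
+    seqspec upgrade -o upgraded.yaml spec.yaml  # write upgraded spec to a file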
+""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec from seqspec.File import File from seqspec.Region import Onlist -from argparse import RawTextHelpFormatter from seqspec import get_version +from seqspec.Assay import Assay -def setup_upgrade_args(parser): +def setup_upgrade_args(parser) -> ArgumentParser: + """Create and configure the upgrade command subparser.""" subparser = parser.add_parser( "upgrade", description=""" Upgrade seqspec file from older versions to the current version. Examples: -seqspec upgrade spec.yaml # Upgrade the spec file +seqspec upgrade -o upgraded.yaml spec.yaml # Upgrade and save to new file +seqspec upgrade spec.yaml # Upgrade and print to stdout --- """, - # help="upgrade seqspec file", + help="Upgrade seqspec file to current version", formatter_class=RawTextHelpFormatter, ) - # subparser_required = subparser.add_argument_group("required arguments") - - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_upgrade_args(parser, args): - fn = args.yaml - o = args.o +def validate_upgrade_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the upgrade command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") - run_upgrade(spec_fn=fn, o=o) + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") -def run_upgrade(spec_fn, o): - spec = load_spec(spec_fn) +def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: + """Run the upgrade command.""" + validate_upgrade_args(parser, args) + + spec = load_spec(args.yaml) version = spec.seqspec_version - upgrade(spec, version) - if o: - spec.to_YAML(o) + upgraded_spec = upgrade(spec, version) + + if args.output: + args.output.write_text(upgraded_spec.to_YAML()) else: - print(spec.to_YAML()) + print(upgraded_spec.to_YAML()) -def upgrade(spec, version): +def upgrade(spec: Assay, version: str) -> Assay: + """Upgrade spec to current version.""" UPGRADE = { - "0.0.0": upgrade_0_2_0_to_0_3_0, + "0.0.0": upgrade_0_0_0_to_0_3_0, "0.1.0": upgrade_0_1_0_to_0_3_0, "0.1.1": upgrade_0_1_1_to_0_3_0, "0.2.0": upgrade_0_2_0_to_0_3_0, get_version(): no_upgrade, } - u = UPGRADE[version](spec) - return u + return UPGRADE[version](spec) def no_upgrade(spec): + """No upgrade needed for current version.""" return spec -def upgrade_0_2_0_to_0_3_0(spec): - # for backwards compatibilty, for specs < v0.3.0 set the files to empty - # of the specs < v0.3.0, set the onlist regions with missing properties - # if version.parse(spec.seqspec_version) < version.parse("0.3.0"): - +def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.2.0 to 0.3.0.""" + # Set files to empty for specs < v0.3.0 for r in spec.sequence_spec: r.set_files( [ @@ -85,11 +99,12 @@ def upgrade_0_2_0_to_0_3_0(spec): ] ) + # Update onlist regions with missing properties for r in spec.library_spec: for lf in r.get_leaves(): if lf.onlist is not None: filename = lf.onlist.filename - location = lf.onlist.location + # location = lf.onlist.location md5 = lf.onlist.md5 lf.onlist = Onlist( file_id=filename, @@ 
-99,19 +114,22 @@ def upgrade_0_2_0_to_0_3_0(spec): url="", urltype="", md5=md5, - location=location, + # location=location, ) spec.seqspec_version = get_version() return spec -def upgrade_0_1_1_to_0_3_0(spec): +def upgrade_0_1_1_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.1 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) -def upgrade_0_1_0_to_0_3_0(spec): +def upgrade_0_1_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.1.0 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) -def upgrade_0_0_0_to_0_3_0(spec): +def upgrade_0_0_0_to_0_3_0(spec: Assay) -> Assay: + """Upgrade spec from version 0.0.0 to 0.3.0.""" return upgrade_0_2_0_to_0_3_0(spec) diff --git a/seqspec/seqspec_version.py b/seqspec/seqspec_version.py index 56a52e0..2ed7c54 100644 --- a/seqspec/seqspec_version.py +++ b/seqspec/seqspec_version.py @@ -1,53 +1,68 @@ +"""Version module for seqspec. + +This module provides functionality to get seqspec tool version and seqspec file version. +""" + +from pathlib import Path +from argparse import ArgumentParser, RawTextHelpFormatter, Namespace + from seqspec.utils import load_spec +from seqspec.Assay import Assay from . import __version__ -from argparse import RawTextHelpFormatter -def setup_version_args(parser): +def setup_version_args(parser) -> ArgumentParser: + """Create and configure the version command subparser.""" subparser = parser.add_parser( "version", description=""" Get seqspec version and seqspec file version. Examples: -seqspec version spec.yaml +seqspec version -o version.txt spec.yaml # Save version info to file +seqspec version spec.yaml # Print version info to stdout --- """, help="Get seqspec tool version and seqspec file version", formatter_class=RawTextHelpFormatter, ) - subparser.add_argument("yaml", help="Sequencing specification yaml file") + subparser.add_argument("yaml", help="Sequencing specification yaml file", type=Path) subparser.add_argument( "-o", + "--output", metavar="OUT", - help=("Path to output file"), - type=str, + help="Path to output file", + type=Path, default=None, ) return subparser -def validate_version_args(parser, args): - # if everything is valid the run_version - fn = args.yaml - o = args.o - run_version(fn, o) +def validate_version_args(parser: ArgumentParser, args: Namespace) -> None: + """Validate the version command arguments.""" + if not Path(args.yaml).exists(): + parser.error(f"Input file does not exist: {args.yaml}") + + if args.output and Path(args.output).exists() and not Path(args.output).is_file(): + parser.error(f"Output path exists but is not a file: {args.output}") + + +def run_version(parser: ArgumentParser, args: Namespace) -> None: + """Run the version command.""" + validate_version_args(parser, args) + spec = load_spec(args.yaml) + version_info = version(spec) -def run_version(spec_fn, o): - spec = load_spec(spec_fn) - s = version(spec) - if o: - with open(o, "w") as f: - print(s, file=f) + if args.output: + args.output.write_text(version_info) else: - print(s) - return + print(version_info) -def version(spec): +def version(spec: Assay) -> str: + """Get version information for spec and tool.""" version = spec.seqspec_version tool_version = __version__ - s = f"seqspec version: {tool_version}\nseqspec file version: {version}" - return s + return f"seqspec version: {tool_version}\nseqspec file version: {version}" diff --git a/setup.cfg b/setup.cfg index 594931a..f645753 100644 --- a/setup.cfg +++ b/setup.cfg @@ -9,7 +9,7 @@ tag = True [bumpversion:file:README.md] [flake8] -exclude = 
.git,.github,__pycache__,build,dist +exclude = .git,.github,__pycache__,build,dist,venv statistics = True max-line-length = 88 extend-ignore = E203,E501 From 5f1192f97384cbf17951d8fe16cd177c387f2b3a Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 10 Jul 2025 15:21:56 -0500 Subject: [PATCH 15/21] CHECK-214-region-type (#10) --- seqspec/schema/seqspec.schema.json | 1 + 1 file changed, 1 insertion(+) diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 0dc9a04..1fbf3aa 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -268,6 +268,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", From 1a29c3ce9ed8dc32bbfa24a5f68bb67e5437770f Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Thu, 10 Jul 2025 16:58:46 -0500 Subject: [PATCH 16/21] CHECK-219-api-merge(#12) --- seqspec/seqspec_check.py | 50 +++++--- seqspec/seqspec_file.py | 100 +++++++-------- seqspec/seqspec_find.py | 67 +++++++--- seqspec/seqspec_index.py | 115 +++++++++-------- seqspec/seqspec_info.py | 250 ++++++++++++++++++++++++++----------- seqspec/seqspec_init.py | 72 +++++++---- seqspec/seqspec_print.py | 36 ++++-- seqspec/seqspec_split.py | 27 ++-- seqspec/seqspec_upgrade.py | 11 +- seqspec/seqspec_version.py | 25 +++- 10 files changed, 488 insertions(+), 265 deletions(-) diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 82a2894..e9b2cef 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -5,6 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace +from typing import List, Dict, Optional from jsonschema import Draft4Validator import yaml @@ -62,26 +63,43 @@ def validate_check_args(parser: ArgumentParser, args: Namespace) -> None: parser.error(f"Output path exists but is not a file: {args.output}") +def format_error(errobj, idx=0): + return f"[error {idx}] {errobj['error_message']}" + + +def seqspec_check( + spec: Assay, spec_fn: str, filter_type: Optional[str] = None +) -> List[Dict]: + """Core functionality to check a seqspec and return filtered errors. + + Args: + spec: The Assay object to check + spec_fn: Path to the spec file, used for relative path resolution + filter_type: Optional filter type to apply to errors (e.g. 
"igvf", "igvf_onlist_skip") + + Returns: + List of error dictionaries + """ + errors = check(spec, spec_fn) + if filter_type: + errors = filter_errors(errors, filter_type) + return errors + + def run_check(parser: ArgumentParser, args: Namespace): """Run the check command.""" validate_check_args(parser, args) spec = load_spec(args.yaml) - errors = check(spec, args.yaml) - - if args.skip == "igvf": - errors = filter_errors(errors, "igvf") - elif args.skip == "igvf_onlist_skip": - errors = filter_errors(errors, "igvf_onlist_skip") - - if errors: - if args.output: - with open(args.output, "w") as f: - for idx, e in enumerate(errors, 1): - print(format_error(e, idx), file=f) - else: + errors = seqspec_check(spec, args.yaml, args.skip) + + if args.output: + with open(args.output, "w") as f: for idx, e in enumerate(errors, 1): - print(format_error(e, idx)) + print(format_error(e, idx), file=f) + else: + for idx, e in enumerate(errors, 1): + print(format_error(e, idx)) return errors @@ -123,10 +141,6 @@ def filter_errors(errors, filter_type): return errors -def format_error(errobj, idx=0): - return f"[error {idx}] {errobj['error_message']}" - - def check(spec: Assay, spec_fn: str): # Variety of checks against schema def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): diff --git a/seqspec/seqspec_file.py b/seqspec/seqspec_file.py index 48563e6..d1ad8e5 100644 --- a/seqspec/seqspec_file.py +++ b/seqspec/seqspec_file.py @@ -129,55 +129,23 @@ def validate_file_args(parser: ArgumentParser, args: Namespace) -> None: ) -def run_file(parser: ArgumentParser, args: Namespace) -> None: - """Run the file command.""" - validate_file_args(parser, args) - - spec = load_spec(args.yaml) - ids = args.ids.split(",") if args.ids else [] - - files = list_files( - spec, - args.modality, - ids, - args.selector, - args.format, - args.key, - args.yaml, - args.fullpath, - ) - - if files: - if args.output: - args.output.write_text(str(files)) - else: - print(files) - - -def list_files( +def seqspec_file( spec: Assay, modality: str, - ids: List[str], - idtype: str, - fmt: str, - k: str, - spec_fn: Path, - fp: bool = False, -) -> str: - """List files based on the given parameters. + ids: Optional[List[str]] = None, + selector: str = "read", +) -> Dict[str, List[File]]: + """Core functionality to list files from a seqspec. Args: - spec: The seqspec specification. - modality: The modality to list files for. - ids: List of IDs to filter by. - idtype: Type of ID to filter by (read, region, file, region-type). - fmt: Output format (paired, interleaved, index, list, json). - k: Key to use for output (file_id, filename, etc.). - spec_fn: Path to the spec file. - fp: Whether to use full paths for local files. + spec: The Assay object to operate on + spec_fn: Path to the spec file, used for relative path resolution + modality: The modality to list files for + ids: Optional list of IDs to filter by + selector: Type of ID to filter by (read, region, file, region-type) Returns: - Formatted string containing the file information. + Dictionary mapping IDs to lists of File objects """ # NOTE: LIST FILES DOES NOT RESPECT ORDERING OF INPUT IDs LIST # NOTE: seqspec file -s read gets the files for the read, not the files mapped from the regions associated with the read. 
@@ -194,24 +162,46 @@ def list_files( "region-type": list_files_by_region_type, } - FORMAT = { - "list": format_list_files_metadata, - "paired": format_list_files, - "interleaved": format_list_files, - "index": format_list_files, - "json": format_json_files, - } - # Get files based on whether we're filtering by IDs if not ids: # list all files - files = LIST_FILES[idtype](spec, modality) + return LIST_FILES[selector](spec, modality) else: # list files by id - files = LIST_FILES_BY_ID[idtype](spec, modality, ids) + return LIST_FILES_BY_ID[selector](spec, modality, ids) + + +def run_file(parser: ArgumentParser, args: Namespace) -> None: + """Run the file command.""" + validate_file_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] - # Format the output - return FORMAT[fmt](files, fmt, k, spec_fn, fp) + files = seqspec_file( + spec=spec, + modality=args.modality, + ids=ids, + selector=args.selector, + ) + + if files: + FORMAT = { + "list": format_list_files_metadata, + "paired": format_list_files, + "interleaved": format_list_files, + "index": format_list_files, + "json": format_json_files, + } + + result = FORMAT[args.format]( + files, args.format, args.key, Path(args.yaml), args.fullpath + ) + + if args.output: + args.output.write_text(str(result)) + else: + print(result) def list_read_files(spec: Assay, modality: str) -> Dict[str, List[File]]: diff --git a/seqspec/seqspec_find.py b/seqspec/seqspec_find.py index c1826ce..d1cd9f5 100644 --- a/seqspec/seqspec_find.py +++ b/seqspec/seqspec_find.py @@ -7,7 +7,7 @@ from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS import warnings import yaml -from typing import List +from typing import List, Optional, Union from seqspec.utils import load_spec from seqspec.Assay import Assay @@ -104,25 +104,48 @@ def validate_find_args(parser: ArgumentParser, args: Namespace) -> None: args.id = args.r +def seqspec_find( + spec: Assay, selector: str, modality: str, id: Optional[str] = None +) -> Union[List[Read], List[Region], List[File]]: + """Core functionality to find objects in a seqspec file. + + Args: + spec: The Assay object to search in + selector: Type of object to search for (read, region, file, region-type) + modality: The modality to search in + id: The ID to search for (optional) + + Returns: + List of found objects matching the search criteria: + - List[Read] for "read" selector + - List[Region] for "region" and "region-type" selectors + - List[File] for "file" selector + - Empty list for unknown selectors + """ + FIND = { + "region-type": find_by_region_type, + "region": find_by_region_id, + "read": find_by_read_id, + "file": find_by_file_id, + } + + if selector not in FIND: + warnings.warn( + f"Unknown selector '{selector}'. 
Valid selectors are: {', '.join(FIND.keys())}" + ) + return [] + + return FIND[selector](spec, modality, id) + + def run_find(parser: ArgumentParser, args: Namespace) -> None: """Run the find command.""" validate_find_args(parser, args) spec = load_spec(args.yaml) - found = [] - - if args.selector == "region-type": - found = find_by_region_type(spec, args.modality, args.id) - elif args.selector == "region": - found = find_by_region_id(spec, args.modality, args.id) - elif args.selector == "read": - found = find_by_read_id(spec, args.modality, args.id) - elif args.selector == "file": - found = find_by_file_id(spec, args.modality, args.id) - else: - raise ValueError(f"Unknown selector: {args.selector}") + found = seqspec_find(spec, args.selector, args.modality, args.id) - # post processing + # Handle output if args.output: with open(args.output, "w") as f: yaml.dump(found, f, sort_keys=False) @@ -130,7 +153,7 @@ def run_find(parser: ArgumentParser, args: Namespace) -> None: print(yaml.dump(found, sort_keys=False)) -def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: +def find_by_read_id(spec: Assay, modality: str, id: Optional[str]) -> List[Read]: """Find reads by their ID. Args: @@ -142,6 +165,8 @@ def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: A list of Read objects matching the ID. """ rds = [] + if id is None: + return rds reads = spec.get_seqspec(modality) for r in reads: if r.read_id == id: @@ -149,7 +174,7 @@ def find_by_read_id(spec: Assay, modality: str, id: str) -> List[Read]: return rds -def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: +def find_by_file_id(spec: Assay, modality: str, id: Optional[str]) -> List[File]: """Find files by their ID. Args: @@ -161,6 +186,8 @@ def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: A list of File objects matching the ID. """ files = [] + if id is None: + return files lf = list_all_files(spec, modality) for k, v in lf.items(): for f in v: @@ -169,7 +196,7 @@ def find_by_file_id(spec: Assay, modality: str, id: str) -> List[File]: return files -def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: +def find_by_region_id(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: """Find regions by their ID. Args: @@ -180,12 +207,14 @@ def find_by_region_id(spec: Assay, modality: str, id: str) -> List[Region]: Returns: A list of Region objects matching the ID. """ + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_id(id) return regions -def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: +def find_by_region_type(spec: Assay, modality: str, id: Optional[str]) -> List[Region]: """Find regions by their type. Args: @@ -196,6 +225,8 @@ def find_by_region_type(spec: Assay, modality: str, id: str) -> List[Region]: Returns: A list of Region objects matching the type. 
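+
+    Example (illustrative; modality and region type are placeholders):
+        >>> regions = find_by_region_type(spec, "rna", "barcode")
+        >>> [r.region_id for r in regions]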
""" + if id is None: + return [] m = spec.get_libspec(modality) regions = m.get_region_by_region_type(id) return regions diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index c08576b..f0caa74 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -6,7 +6,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace, SUPPRESS import warnings -from typing import List, Optional +from typing import List, Optional, Dict, Any from seqspec.utils import load_spec, map_read_id_to_regions from seqspec.seqspec_find import find_by_region_id @@ -156,52 +156,52 @@ def validate_index_args(parser: ArgumentParser, args: Namespace) -> None: parser.error("Must specify ids with -i for -s read or -s region") -def run_index(parser: ArgumentParser, args: Namespace) -> None: - """Run the index command.""" - validate_index_args(parser, args) - - spec = load_spec(args.yaml) - ids = args.ids.split(",") if args.ids else [] - - result = index( - spec, - args.modality, - ids, - args.selector, - args.tool, - args.rev, - args.subregion_type, - ) - - if args.output: - with open(args.output, "w") as f: - print(result, file=f) - else: - print(result) - - -def index( +def seqspec_index( spec: Assay, modality: str, ids: List[str], idtype: str, - fmt: str, rev: bool = False, - subregion_type: Optional[str] = None, +) -> List[Dict[str, Any]]: + """Core functionality to get index information from the spec. + + Args: + spec: The Assay object to index + modality: The modality to index + ids: List of IDs to index + idtype: Type of ID (read, region, file) + rev: Whether to return 3'->5' region order + + Returns: + List of index dictionaries containing region coordinates and strand information + """ + GET_INDICES = { + "file": get_index_by_files, + } + + GET_INDICES_BY_IDS = { + "file": get_index_by_file_ids, + "region": get_index_by_region_ids, + "read": get_index_by_read_ids, + } + + if not ids: + return GET_INDICES[idtype](spec, modality) + return GET_INDICES_BY_IDS[idtype](spec, modality, ids) + + +def format_index( + indices: List[Dict[str, Any]], fmt: str, subregion_type: Optional[str] = None ) -> str: - """Get index information from the spec. + """Format index information into a specific output format. Args: - spec: The seqspec specification. - modality: The modality to index. - ids: List of IDs to index. - idtype: Type of ID (read, region, file). - fmt: Output format. - rev: Whether to return 3'->5' region order. - subregion_type: Optional subregion type. + indices: List of index dictionaries from seqspec_index + fmt: Output format to use + subregion_type: Optional subregion type for filtering Returns: - Formatted index information. + Formatted index information as a string """ FORMAT = { "chromap": format_chromap, @@ -216,22 +216,37 @@ def index( "zumis": format_zumis, } - GET_INDICES = { - "file": get_index_by_files, - } + if fmt not in FORMAT: + warnings.warn( + f"Unknown format '{fmt}'. 
Valid formats are: {', '.join(FORMAT.keys())}" + ) + return "" - GET_INDICES_BY_IDS = { - "file": get_index_by_file_ids, - "region": get_index_by_region_ids, - "read": get_index_by_read_ids, - } + return FORMAT[fmt](indices, subregion_type) - if not ids: - indices = GET_INDICES[idtype](spec, modality) - else: - indices = GET_INDICES_BY_IDS[idtype](spec, modality, ids) - return FORMAT[fmt](indices, subregion_type) +def run_index(parser: ArgumentParser, args: Namespace) -> None: + """Run the index command.""" + validate_index_args(parser, args) + + spec = load_spec(args.yaml) + ids = args.ids.split(",") if args.ids else [] + + indices = seqspec_index( + spec, + args.modality, + ids, + args.selector, + args.rev, + ) + + result = format_index(indices, args.tool, args.subregion_type) + + if args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) def get_index_by_files(spec, modality): diff --git a/seqspec/seqspec_info.py b/seqspec/seqspec_info.py index b9573f7..9a25731 100644 --- a/seqspec/seqspec_info.py +++ b/seqspec/seqspec_info.py @@ -1,8 +1,6 @@ from seqspec.utils import load_spec import json -from typing import List -from seqspec.Region import Region -from seqspec.Read import Read +from typing import Dict from seqspec.Assay import Assay from argparse import RawTextHelpFormatter, ArgumentParser, Namespace from pathlib import Path @@ -74,105 +72,215 @@ def run_info(parser: ArgumentParser, args: Namespace) -> None: validate_info_args(parser, args) spec = load_spec(args.yaml) - CMD = { + + if args.key: + # Extract data + info = seqspec_info(spec, args.key) + # Format info + result = format_info(info, args.key, args.format) + + if args.output: + with open(args.output, "w") as f: + if args.format == "json": + f.write(result) + else: + print(result, file=f) + else: + print(result) + + +def seqspec_info(spec: Assay, key: str) -> Dict: + """Get information from the spec based on the key. + + Args: + spec: The Assay object to get info from + key: The type of information to retrieve (modalities, meta, sequence_spec, library_spec) + + Returns: + Dictionary containing the requested information + + Raises: + KeyError: If the requested key is not supported + """ + INFO_FUNCS = { "modalities": seqspec_info_modalities, - "meta": seqspec_info, + "meta": seqspec_info_meta, "sequence_spec": seqspec_info_sequence_spec, "library_spec": seqspec_info_library_spec, } - s = "" - if args.key: - s = CMD[args.key](spec, args.format) + if key not in INFO_FUNCS: + raise KeyError( + f"Unsupported info key: {key}. Must be one of {list(INFO_FUNCS.keys())}" + ) + return INFO_FUNCS[key](spec) + - if args.output: - with open(args.output, "w") as f: - json.dump(s, f, sort_keys=False, indent=4) - else: - print(s) +def format_info(info: Dict, key: str, fmt: str = "tab") -> str: + """Format information based on the key and format. + Args: + info: Dictionary containing the information to format + key: The type of information to format (modalities, meta, sequence_spec, library_spec) + fmt: Output format (tab or json) -def seqspec_info(spec: Assay, fmt: str) -> str: - """Get meta information about the spec.""" - s = format_info(spec, fmt) - return s + Returns: + Formatted string + Raises: + KeyError: If the requested key is not supported + """ + FORMAT_FUNCS = { + "modalities": format_modalities, + "meta": format_meta, + "sequence_spec": format_sequence_spec, + "library_spec": format_library_spec, + } + if key not in FORMAT_FUNCS: + raise KeyError( + f"Unsupported format key: {key}. 
Must be one of {list(FORMAT_FUNCS.keys())}" + ) + return FORMAT_FUNCS[key](info, fmt) + + +def seqspec_info_meta(spec: Assay) -> Dict: + """Get meta information about the spec. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing meta information + """ + sd = spec.to_dict() + del sd["library_spec"] + del sd["sequence_spec"] + del sd["modalities"] + return {"meta": sd} -def seqspec_info_library_spec(spec: Assay, fmt: str) -> str: - """Get library specification information.""" + +def seqspec_info_library_spec(spec: Assay) -> Dict: + """Get library specification information. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing library specifications by modality + """ modalities = spec.list_modalities() - s = "" + result = {} for m in modalities: libspec = spec.get_libspec(m) - s += format_library_spec(m, libspec.get_leaves(), fmt) - return s + result[m] = libspec.get_leaves() + return {"library_spec": result} -def seqspec_info_sequence_spec(spec: Assay, fmt: str) -> str: - """Get sequence specification information.""" - reads = format_sequence_spec(spec.sequence_spec, fmt) - return reads +def seqspec_info_sequence_spec(spec: Assay) -> Dict: + """Get sequence specification information. + Args: + spec: The Assay object to get info from -def seqspec_info_modalities(spec: Assay, fmt: str) -> str: - """Get list of modalities.""" - modalities = format_modalities(spec.list_modalities(), fmt) - return modalities + Returns: + Dictionary containing sequence specifications + """ + return {"sequence_spec": spec.sequence_spec} -def format_info(spec: Assay, fmt: str = "tab") -> str: - """Format meta information.""" - sd = spec.to_dict() - del sd["library_spec"] - del sd["sequence_spec"] - del sd["modalities"] - s = "" +def seqspec_info_modalities(spec: Assay) -> Dict: + """Get list of modalities. + + Args: + spec: The Assay object to get info from + + Returns: + Dictionary containing list of modalities + """ + return {"modalities": spec.list_modalities()} + + +def format_meta(info: Dict, fmt: str = "tab") -> str: + """Format meta information. + + Args: + info: Dictionary containing meta information from seqspec_info_meta + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - for k, v in sd.items(): - s += f"{v}\t" - s = s[:-1] + return "\t".join(str(v) for v in info["meta"].values()) elif fmt == "json": - s = json.dumps(sd, sort_keys=False, indent=4) - return s + return json.dumps(info["meta"], sort_keys=False, indent=4) + return "" + + +def format_modalities(info: Dict, fmt: str = "tab") -> str: + """Format list of modalities. + Args: + info: Dictionary containing modalities from seqspec_info_modalities + fmt: Output format (tab or json) -def format_modalities(modalities: List[str], fmt: str = "tab") -> str: - """Format list of modalities.""" - s = "" + Returns: + Formatted string + """ if fmt == "tab": - s = "\t".join(modalities) + return "\t".join(info["modalities"]) elif fmt == "json": - s = json.dumps(modalities, sort_keys=False, indent=4) - return s + return json.dumps(info["modalities"], sort_keys=False, indent=4) + return "" -def format_sequence_spec(sequence_spec: List[Read], fmt: str = "tab") -> str: - """Format sequence specification.""" - s = "" +def format_sequence_spec(info: Dict, fmt: str = "tab") -> str: + """Format sequence specification. 
+ + Args: + info: Dictionary containing sequence specs from seqspec_info_sequence_spec + fmt: Output format (tab or json) + + Returns: + Formatted string + """ if fmt == "tab": - # format the output as a table - for r in sequence_spec: + lines = [] + for r in info["sequence_spec"]: files = ",".join([i.file_id for i in r.files]) if r.files else "" - s += f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}\n" - s = s[:-1] + lines.append( + f"{r.modality}\t{r.read_id}\t{r.strand}\t{r.min_len}\t{r.max_len}\t{r.primer_id}\t{r.name}\t{files}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps([i.to_dict() for i in sequence_spec], sort_keys=False, indent=4) - return s + return json.dumps( + [i.to_dict() for i in info["sequence_spec"]], sort_keys=False, indent=4 + ) + return "" + + +def format_library_spec(info: Dict, fmt: str = "tab") -> str: + """Format library specification. + Args: + info: Dictionary containing library specs from seqspec_info_library_spec + fmt: Output format (tab or json) -def format_library_spec( - modality: str, library_spec: List[Region], fmt: str = "tab" -) -> str: - """Format library specification.""" - s = "" + Returns: + Formatted string + """ if fmt == "tab": - for r in library_spec: - file = None - if r.onlist: - file = r.onlist.filename - s += f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}\n" - s = s[:-1] + lines = [] + for modality, regions in info["library_spec"].items(): + for r in regions: + file = r.onlist.filename if r.onlist else None + lines.append( + f"{modality}\t{r.region_id}\t{r.region_type}\t{r.name}\t{r.sequence_type}\t{r.sequence}\t{r.min_len}\t{r.max_len}\t{file}" + ) + return "\n".join(lines) elif fmt == "json": - s = json.dumps( - {modality: [i.to_dict() for i in library_spec]}, sort_keys=False, indent=4 + return json.dumps( + {m: [i.to_dict() for i in r] for m, r in info["library_spec"].items()}, + sort_keys=False, + indent=4, ) - return s + return "" diff --git a/seqspec/seqspec_init.py b/seqspec/seqspec_init.py index 361324b..a65af02 100644 --- a/seqspec/seqspec_init.py +++ b/seqspec/seqspec_init.py @@ -54,7 +54,6 @@ def setup_init_args(parser) -> ArgumentParser: help="List of modalities, reads, primer_ids, lengths, and strand (e.g. modality,fastq_name,primer_id,len,strand:...)", required=True, ) - subparser.add_argument( "-o", "--output", @@ -85,39 +84,38 @@ def run_init(parser: ArgumentParser, args: Namespace) -> None: modalities = args.modalities.split(",") reads = parse_reads_string(args.reads) - tree = newick.loads(args.newick) - - if len(tree[0].descendants) != len(modalities): - raise ValueError( - "Number of modalities must match number of modality-FASTQs pairs" - ) + regions = newick_to_regions(args.newick) - spec = init(args.name, modalities, tree[0].descendants, reads) + spec = seqspec_init(args.name, modalities, regions, reads) + yaml_str = spec.to_YAML() + if yaml_str is None: + raise ValueError("Failed to generate YAML string from assay") if args.output: - spec.to_YAML(args.output) + args.output.write_text(yaml_str) else: - print(spec.to_YAML()) + print(yaml_str) -def init( - name: str, modalities: List[str], tree: List[newick.Node], reads: List[Read] +def seqspec_init( + name: str, modalities: List[str], regions: List[Region], reads: List[Read] ) -> Assay: """Initialize a new seqspec specification. Args: - name: Name of the assay. - modalities: List of modalities. - tree: Newick tree nodes. 
- reads: List of read specifications. + name: Name of the assay + modalities: List of modalities + regions: List of Region objects + reads: List of read specifications Returns: - Initialized Assay object. + Initialized Assay object + + Raises: + ValueError: If number of modalities doesn't match number of regions """ - regions = [] - for node in tree: - region = Region(region_id="", region_type="", name="", sequence_type="") - regions.append(newick_to_region(node, region)) + if len(regions) != len(modalities): + raise ValueError("Number of modalities must match number of regions") return Assay( assay_id="", @@ -136,15 +134,39 @@ def init( ) +def newick_to_regions(newick_str: str) -> List[Region]: + """Convert a newick string to a list of Region objects. + + Args: + newick_str: Newick format string representing the library structure + + Returns: + List of Region objects + + Raises: + ValueError: If newick string is invalid + """ + try: + tree = newick.loads(newick_str) + except Exception as e: + raise ValueError(f"Invalid newick string: {e}") + + regions = [] + for node in tree[0].descendants: + region = Region(region_id="", region_type="", name="", sequence_type="") + regions.append(newick_to_region(node, region)) + return regions + + def newick_to_region(node: newick.Node, region: Region) -> Region: """Convert a newick node to a Region object. Args: - node: Newick tree node. - region: Base region object to populate. + node: Newick tree node + region: Base region object to populate Returns: - Populated Region object. + Populated Region object """ region.region_id = node.name region.name = node.name @@ -178,7 +200,7 @@ def parse_reads_string(input_string: str) -> List[Read]: "modality,read_id,primer_id,min_len,strand:..." Returns: - List of Read objects. + List of Read objects """ reads = [] for obj in input_string.split(":"): diff --git a/seqspec/seqspec_print.py b/seqspec/seqspec_print.py index 1b7744a..9ad6687 100644 --- a/seqspec/seqspec_print.py +++ b/seqspec/seqspec_print.py @@ -99,7 +99,30 @@ def run_print(parser: ArgumentParser, args: Namespace) -> None: validate_print_args(parser, args) spec = load_spec(args.yaml) + result = seqspec_print(spec, args.format) + if args.format == "seqspec-png": + result.savefig(args.output, dpi=300, bbox_inches="tight") + elif args.output: + with open(args.output, "w") as f: + print(result, file=f) + else: + print(result) + + +def seqspec_print(spec: Assay, fmt: str): + """Print sequence specification in the specified format. + + Args: + spec: The seqspec specification to print + fmt: The format to print in (library-ascii, seqspec-html, seqspec-png, seqspec-ascii) + + Returns: + The formatted output (string or matplotlib figure) + + Raises: + ValueError: If format is not supported + """ # Map format to print function format_to_function = { "library-ascii": print_library_ascii, @@ -108,15 +131,12 @@ def run_print(parser: ArgumentParser, args: Namespace) -> None: "seqspec-ascii": print_seqspec_ascii, } - result = format_to_function[args.format](spec) + if fmt not in format_to_function: + raise ValueError( + f"Unsupported format: {fmt}. 
Must be one of {list(format_to_function.keys())}" + ) - if args.format == "seqspec-png": - result.savefig(args.output, dpi=300, bbox_inches="tight") - elif args.output: - with open(args.output, "w") as f: - print(result, file=f) - else: - print(result) + return format_to_function[fmt](spec) def print_seqspec_ascii(spec: Assay) -> str: diff --git a/seqspec/seqspec_split.py b/seqspec/seqspec_split.py index 20436a7..2917c81 100644 --- a/seqspec/seqspec_split.py +++ b/seqspec/seqspec_split.py @@ -5,7 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace -from typing import List, Dict, Any +from typing import List from seqspec.utils import load_spec from seqspec.Assay import Assay @@ -53,15 +53,24 @@ def run_split(parser: ArgumentParser, args: Namespace) -> None: validate_split_args(parser, args) spec = load_spec(args.yaml) - specs = split(spec, args.output) + specs = seqspec_split(spec) - for spec_info in specs: - output_path = args.output / f"{spec_info['prefix']}{spec_info['modality']}.yaml" - spec_info["spec"].to_YAML(output_path) + prefix = "spec." if args.output.name == "" else f"{args.output.name}." + for spec_m in specs: + modality = spec_m.list_modalities()[0] + output_path = args.output / f"{prefix}{modality}.yaml" + spec_m.to_YAML(output_path) -def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: - """Split spec into one file per modality.""" +def seqspec_split(spec: Assay) -> List[Assay]: + """Split spec into one file per modality. + + Args: + spec: The Assay object to split + + Returns: + List of Assay objects, each containing a single modality + """ specs = [] modalities = spec.list_modalities() @@ -85,8 +94,6 @@ def split(spec: Assay, output_dir: Path) -> List[Dict[str, Any]]: } spec_m = Assay(**info) spec_m.update_spec() - - prefix = "spec." if output_dir.name == "" else f"{output_dir.name}." - specs.append({"prefix": prefix, "spec": spec_m, "modality": modality}) + specs.append(spec_m) return specs diff --git a/seqspec/seqspec_upgrade.py b/seqspec/seqspec_upgrade.py index 62ec210..b363294 100644 --- a/seqspec/seqspec_upgrade.py +++ b/seqspec/seqspec_upgrade.py @@ -55,7 +55,7 @@ def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: spec = load_spec(args.yaml) version = spec.seqspec_version - upgraded_spec = upgrade(spec, version) + upgraded_spec = seqspec_upgrade(spec, version) if args.output: args.output.write_text(upgraded_spec.to_YAML()) @@ -63,7 +63,7 @@ def run_upgrade(parser: ArgumentParser, args: Namespace) -> None: print(upgraded_spec.to_YAML()) -def upgrade(spec: Assay, version: str) -> Assay: +def seqspec_upgrade(spec: Assay, version: str) -> Assay: """Upgrade spec to current version.""" UPGRADE = { "0.0.0": upgrade_0_0_0_to_0_3_0, @@ -73,6 +73,11 @@ def upgrade(spec: Assay, version: str) -> Assay: get_version(): no_upgrade, } + if version not in UPGRADE: + raise ValueError( + f"Unsupported version: {version}. 
Must be one of {list(UPGRADE.keys())}" + ) + return UPGRADE[version](spec) @@ -104,7 +109,6 @@ def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: for lf in r.get_leaves(): if lf.onlist is not None: filename = lf.onlist.filename - # location = lf.onlist.location md5 = lf.onlist.md5 lf.onlist = Onlist( file_id=filename, @@ -114,7 +118,6 @@ def upgrade_0_2_0_to_0_3_0(spec: Assay) -> Assay: url="", urltype="", md5=md5, - # location=location, ) spec.seqspec_version = get_version() return spec diff --git a/seqspec/seqspec_version.py b/seqspec/seqspec_version.py index 2ed7c54..98580ba 100644 --- a/seqspec/seqspec_version.py +++ b/seqspec/seqspec_version.py @@ -5,7 +5,7 @@ from pathlib import Path from argparse import ArgumentParser, RawTextHelpFormatter, Namespace - +from typing import Dict from seqspec.utils import load_spec from seqspec.Assay import Assay from . import __version__ @@ -53,16 +53,29 @@ def run_version(parser: ArgumentParser, args: Namespace) -> None: validate_version_args(parser, args) spec = load_spec(args.yaml) - version_info = version(spec) + vinfo = seqspec_version(spec) + finfo = format_version(vinfo) if args.output: - args.output.write_text(version_info) + args.output.write_text(finfo) else: - print(version_info) + print(finfo) -def version(spec: Assay) -> str: +def seqspec_version(spec: Assay) -> Dict: """Get version information for spec and tool.""" version = spec.seqspec_version tool_version = __version__ - return f"seqspec version: {tool_version}\nseqspec file version: {version}" + return {"file_version": version, "tool_version": tool_version} + + +def format_version(vinfo: Dict) -> str: + """Format version information into a string. + + Args: + vinfo: Dictionary containing file_version and tool_version + + Returns: + Formatted string with version information + """ + return f"seqspec version: {vinfo['tool_version']}\nseqspec file version: {vinfo['file_version']}" From f1e094bc034d9ee12c732762ef9b4e31e365240d Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Wed, 16 Jul 2025 14:24:00 -0500 Subject: [PATCH 17/21] CHECK-201-read-id (#11) --- seqspec/schema/seqspec_igvf.schema.json | 396 ++++++++++++++++++ .../seqspec_igvf_onlist_skip.schema.json | 395 +++++++++++++++++ seqspec/seqspec_check.py | 29 +- tests/test_region.py | 16 +- tests/test_seqspec_check.py | 132 +++++- tests/test_seqspec_onlist.py | 91 ++-- tests/test_utils.py | 32 +- 7 files changed, 1033 insertions(+), 58 deletions(-) create mode 100644 seqspec/schema/seqspec_igvf.schema.json create mode 100644 seqspec/schema/seqspec_igvf_onlist_skip.schema.json diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json new file mode 100644 index 0000000..b476011 --- /dev/null +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -0,0 +1,396 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper 
that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": 
"integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { + "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read.", + "pattern": "^IGVF.*" + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." 
+ }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json new file mode 100644 index 0000000..9bf86fd --- /dev/null +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -0,0 +1,395 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "Assay.schema.json", + "title": "Assay", + "description": "A Assay of DNA", + "type": "object", + "properties": { + "seqspec_version": { + "description": "Version of the seqspec specification used", + "type": "string", + "pattern": "^(0|[1-9]\\d*)\\.(0|[1-9]\\d*)\\.(0|[1-9]\\d*)(?:-((?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\\.(?:0|[1-9]\\d*|\\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\\+([0-9a-zA-Z-]+(?:\\.[0-9a-zA-Z-]+)*))?$" + }, + "assay_id": { + "description": "Identifier for the assay", + "type": "string" + }, + "name": { + "description": "The name of the assay", + "type": "string" + }, + "doi": { + "description": "the doi of the paper that describes the assay", + "type": "string" + }, + "date": { + "description": "The seqspec creation date", + "type": "string", + "pattern": "^(0?[1-9]|[12][0-9]|3[01])\\s(January|February|March|April|May|June|July|August|September|October|November|December)\\s(19|20)\\d\\d$" + }, + "description": { + "description": "A short description of the assay", + "type": "string" + }, + "modalities": { + "description": "The modalities the assay targets", + "type": "array", + "items": { + "type": "string", + "enum": ["dna", "rna", "tag", "protein", "atac", "crispr"] + } + }, + "lib_struct": { + "description": "The link to Teichmann's libstructs page derived for this sequence", + "type": "string" + }, + "library_protocol": { + "description": "The protocol/machine/tool to generate the library insert", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "library_kit": { + "description": "The kit used to make the library sequence_protocol compatible", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_protocol": { + "description": "The protocol/machine/tool to generate sequences", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + 
"type": "object", + "properties": { + "protocol_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_kit": { + "description": "The kit used with the protocol to sequence the library", + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "object", + "properties": { + "kit_id": { "type": "string" }, + "name": { "type": ["string", "null"] }, + "modality": { "type": "string" } + } + }, + "minItems": 1 + } + ] + }, + "sequence_spec": { + "description": "The spec for the sequencer", + "type": "array", + "items": { + "$ref": "#/$defs/read" + } + }, + "library_spec": { + "description": "The spec for the assay", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "seqspec_version", + "assay_id", + "name", + "doi", + "date", + "description", + "modalities" + ], + "$defs": { + "region": { + "title": "Region", + "description": "A region of DNA", + "type": "object", + "properties": { + "region_id": { + "description": "identifier for the region", + "type": "string" + }, + "region_type": { + "description": "the type of region", + "type": "string", + "enum": [ + "atac", + "barcode", + "cdna", + "crispr", + "custom_primer", + "dna", + "fastq", + "fastq_link", + "gdna", + "hic", + "illumina_p5", + "illumina_p7", + "index5", + "index7", + "linker", + "ME1", + "ME2", + "methyl", + "named", + "nextera_read1", + "nextera_read2", + "poly_A", + "poly_G", + "poly_T", + "poly_C", + "protein", + "rna", + "s5", + "s7", + "tag", + "truseq_read1", + "truseq_read2", + "umi" + ] + }, + "sequence_type": { + "description": "The type of the sequence", + "type": "string", + "enum": ["fixed", "random", "onlist", "joined"] + }, + "sequence": { + "description": "The sequence", + "type": "string" + }, + "min_len": { + "description": "The minimum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "max_len": { + "description": "The maximum length of the sequence", + "type": "integer", + "minimum": 0, + "maximum": 2048 + }, + "onlist": { + "description": "The file containing the sequence if seq_type = onlist", + "type": ["object", "null"], + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename for the onlist", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "location": { + "description": "location of onlist", + "type": "string", + "enum": ["local", "remote"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string" } + }, + "required": [ + "file_id", + "filename", + "filetype", + "filesize", + "url", + "urltype" + ] + }, + "regions": { + "description": "The regions being joined", + "type": "array", + "items": { + "$ref": "#/$defs/region" + } + } + }, + "required": [ + "region_id", + "region_type", + "sequence_type", + "sequence", + "min_len", + "max_len" + ], + "if": { + "properties": { + "min_len": { + "const": 0 + } + } + }, + "then": { + "properties": { + "sequence": { + "type": "string", + "pattern": "^[ACGTRYMKSWHBVDNX]*$" + } + } + }, + "else": { + "properties": { 
+ "sequence": { + "type": "string", + "minLength": 1, + "pattern": "^[ACGTRYMKSWHBVDNX]+$" + } + } + } + }, + "read": { + "title": "Read", + "type": "object", + "properties": { + "read_id": { + "type": "string", + "description": "The unique identifier for the read." + }, + "name": { + "type": "string", + "description": "The name of the read." + }, + "modality": { + "type": "string", + "description": "The modality of the assay generating the read." + }, + "primer_id": { + "type": "string", + "description": "The region id of the primer used." + }, + "min_len": { + "type": "integer", + "minimum": 0, + "description": "The minimum length of the read, must be greater than or equal to 0." + }, + "max_len": { + "type": "integer", + "exclusiveMinimum": 0, + "description": "The maximum length of the read, must be greater than 0." + }, + "strand": { + "type": "string", + "enum": ["pos", "neg"], + "description": "The strand orientation of the read, either positive ('pos') or negative ('neg')." + }, + "files": { + "description": "An array of files containing the reads", + "type": "array", + "items": { + "type": "object", + "properties": { + "file_id": { + "description": "filename", + "type": "string" + }, + "filename": { + "description": "filename", + "type": "string" + }, + "filetype": { + "description": "the type of file", + "type": "string" + }, + "filesize": { + "description": "the size of the file in bytes", + "type": "integer" + }, + "url": { + "description": "The path or url to the file", + "type": "string" + }, + "urltype": { + "description": "type of file path", + "type": "string", + "enum": ["local", "ftp", "http", "https"] + }, + "md5": { + "description": "md5sum for the file pointed to by filename", + "type": "string", + "pattern": "^[a-f0-9]{32}$" + } + } + } + } + }, + "required": [ + "read_id", + "modality", + "primer_id", + "min_len", + "max_len", + "strand" + ], + "additionalProperties": false + } + } + } + \ No newline at end of file diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index e9b2cef..24a3a3f 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -80,7 +80,7 @@ def seqspec_check( Returns: List of error dictionaries """ - errors = check(spec, spec_fn) + errors = check(spec, spec_fn, filter_type) if filter_type: errors = filter_errors(errors, filter_type) return errors @@ -103,24 +103,14 @@ def run_check(parser: ArgumentParser, args: Namespace): return errors -IGVF_FILTERS = [ - {"error_type": "check_schema", "error_object": "'lib_struct'"}, - {"error_type": "check_schema", "error_object": "'library_protocol'"}, - {"error_type": "check_schema", "error_object": "'library_kit'"}, - {"error_type": "check_schema", "error_object": "'sequence_protocol'"}, - {"error_type": "check_schema", "error_object": "'sequence_kit'"}, - {"error_type": "check_schema", "error_object": "'md5'"}, -] -IGVF_ONLIST_SKIP_FILTERS = IGVF_FILTERS + [ +IGVF_ONLIST_SKIP_FILTERS = [ {"error_type": "check_onlist_files_exist", "error_object": "onlist"} ] def filter_errors(errors, filter_type): filters = None - if filter_type == "igvf": - filters = IGVF_FILTERS - elif filter_type == "igvf_onlist_skip": + if filter_type == "igvf_onlist_skip": filters = IGVF_ONLIST_SKIP_FILTERS if filters: @@ -141,10 +131,19 @@ def filter_errors(errors, filter_type): return errors -def check(spec: Assay, spec_fn: str): +def check(spec: Assay, spec_fn: str, skip: str = None): # Variety of checks against schema def check_schema(spec: Assay, spec_fn: str, errors=[], idx=0): - schema_fn = 
path.join(path.dirname(__file__), "schema/seqspec.schema.json") + if skip == "igvf": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf.schema.json" + ) + elif skip == "igvf_onlist_skip": + schema_fn = path.join( + path.dirname(__file__), "schema/seqspec_igvf_onlist_skip.schema.json" + ) + else: + schema_fn = path.join(path.dirname(__file__), "schema/seqspec.schema.json") with open(schema_fn, "r") as stream: schema = yaml.load(stream, Loader=yaml.Loader) validator = Draft4Validator(schema) diff --git a/tests/test_region.py b/tests/test_region.py index 710665e..3afdf3a 100644 --- a/tests/test_region.py +++ b/tests/test_region.py @@ -66,11 +66,8 @@ def test_simple_onlist(self): url = filename urltype = "file" md5sum = "d41d8cd98f00b204e9800998ecf8427e" - location = "local" - permit = Onlist( - file_id, filename, filetype, filesize, url, "file", md5sum, location - ) + permit = Onlist(file_id, filename, filetype, filesize, url, "file", md5sum) self.assertEqual( permit.to_dict(), @@ -204,9 +201,16 @@ def test_onlists(self): list_url = list_name list_urltype = "file" list_md5sum = "d41d8cd98f00b204e9800998ecf8427e" - list_location = "local" - permited = Onlist(list_id, list_name, list_type, list_size, list_url, list_urltype, list_md5sum, list_location) + permited = Onlist( + list_id, + list_name, + list_type, + list_size, + list_url, + list_urltype, + list_md5sum, + ) r = Region( region_name, diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index ae8ad6b..75d31bd 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -30,8 +30,8 @@ def test_check_args(self): cmdline = ["check", "-o", output_name, spec_name] args = parser.parse_args(cmdline) - self.assertEqual(args.o, output_name) - self.assertEqual(args.yaml, spec_name) + self.assertEqual(str(args.output), output_name) + self.assertEqual(str(args.yaml), spec_name) def test_validate_check_args(self): parser = create_stub_check_parser() @@ -49,4 +49,130 @@ def test_validate_check_args(self): with patch("os.path.exists") as path_exists: path_exists.return_value = True errors = validate_check_args(None, args) - self.assertEqual(errors, []) + self.assertEqual(errors, None) + + def test_check_with_igvf_skip(self): + """Test that 'igvf' skip condition filters out some IGVF-related errors but not read_id pattern errors.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test with 'igvf' skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = "igvf" + + # Run check with igvf + errors = run_check(parser, args) + + # Should have exactly 2 errors: read_id pattern error and onlist file error + self.assertEqual( + len(errors), 2, f"Expected 2 errors, got {len(errors)}: {errors}" + ) + + # Check for read_id pattern error (should not be filtered by igvf skip) + read_id_errors = [ + e + for e in errors + if e.get("error_type") == "check_schema" + and "read_id" in e.get("error_message", "") + ] + self.assertEqual( + len(read_id_errors), + 1, + f"Expected 1 read_id error, got {len(read_id_errors)}", + ) + self.assertIn("1165AJSO", read_id_errors[0]["error_message"]) + self.assertIn("does not match", read_id_errors[0]["error_message"]) + + # Check for onlist file error (should not be filtered 
by igvf skip) + onlist_errors = [ + e for e in errors if e.get("error_type") == "check_onlist_files_exist" + ] + self.assertEqual( + len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" + ) + self.assertIn("does not exist", onlist_errors[0]["error_message"]) + + def test_check_with_igvf_onlist_skip(self): + """Test that 'igvf_onlist_skip' skip condition filters out IGVF and onlist errors including read_id pattern.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test with 'igvf_onlist_skip' skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = "igvf_onlist_skip" + + # Run check with igvf_onlist_skip + errors = run_check(parser, args) + + # Should have no errors (all errors are filtered out by igvf_onlist_skip) + self.assertEqual( + len(errors), 0, f"Expected 0 errors, got {len(errors)}: {errors}" + ) + + def test_check_without_skip(self): + """Test that without skip condition, validation errors are reported.""" + from seqspec.seqspec_check import run_check + from argparse import ArgumentParser, Namespace + + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + + # Test without skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = None + + # Run check without skip + errors = run_check(parser, args) + + # Should have exactly 2 errors: sequence_protocol error and onlist file error + self.assertEqual( + len(errors), 2, f"Expected 2 errors, got {len(errors)}: {errors}" + ) + + # Check for sequence_protocol error + protocol_errors = [ + e + for e in errors + if e.get("error_type") == "check_schema" + and "sequence_protocol" in e.get("error_message", "") + ] + self.assertEqual( + len(protocol_errors), + 1, + f"Expected 1 sequence_protocol error, got {len(protocol_errors)}", + ) + + # Check for onlist file error + onlist_errors = [ + e for e in errors if e.get("error_type") == "check_onlist_files_exist" + ] + self.assertEqual( + len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" + ) diff --git a/tests/test_seqspec_onlist.py b/tests/test_seqspec_onlist.py index 40f6ad9..cd31149 100644 --- a/tests/test_seqspec_onlist.py +++ b/tests/test_seqspec_onlist.py @@ -1,6 +1,5 @@ from argparse import ArgumentParser from contextlib import contextmanager -from io import StringIO import os from tempfile import TemporaryDirectory from unittest import TestCase @@ -12,7 +11,6 @@ join_onlists, join_product_onlist, join_multi_onlist, - join_onlists, run_onlist_region, run_onlist_read, setup_onlist_args, @@ -33,7 +31,7 @@ def create_temporary_barcode_files(filenames): os.chdir(tmpdir) for name in filenames: filename = os.path.join(tmpdir, name) - with open(filename, "wt") as outstream: + with open(filename, "wt"): pass yield tmpdir finally: @@ -66,13 +64,29 @@ def test_find_list_target_dir_local(self): with create_temporary_barcode_files(["index_onlist.txt"]) as tmpdir: filename = os.path.join(tmpdir, "temp.tsv") - onlist1 = Onlist("temp_id", filename, "tsv", 300, filename, "local", "d41d8cd98f00b204e9800998ecf8427e", "local") + onlist1 = Onlist( + 
"temp_id", + filename, + "tsv", + 300, + filename, + "local", + "d41d8cd98f00b204e9800998ecf8427e", + ) target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, tmpdir) def test_find_list_target_dir_remote(self): - onlist1 = Onlist("temp_id", "temp.tsv", "tsv", 300, "http://localhost:9/temp.tsv", "http", "d41d8cd98f00b204e9800998ecf8427e", "remote") + onlist1 = Onlist( + "temp_id", + "temp.tsv", + "tsv", + 300, + "http://localhost:9/temp.tsv", + "http", + "d41d8cd98f00b204e9800998ecf8427e", + ) target_dir = find_list_target_dir([onlist1]) self.assertEqual(target_dir, os.getcwd()) @@ -126,57 +140,80 @@ def test_join_onlist_multi(self): def test_local_validate_onlist_args(self): onlist_name = "index_onlist.tsv" with create_temporary_barcode_files([onlist_name]) as tmpdir: - expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) - args = parser.parse_args([ - "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) + args = parser.parse_args( + [ + "onlist", + "-m", + "rna", + "-i", + "read1.fastq.gz", + "-f", + "multi", + spec_path, + ] + ) def load_spec(*args, **kwargs): return load_example_spec(example_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader: - validate_onlist_args(parser, args) + with patch("seqspec.seqspec_onlist.load_spec", load_spec): + with patch("pathlib.Path.exists", return_value=True): + validate_onlist_args(parser, args) def test_local_cached_remote_validate_onlist_args(self): # Test that we will can use a locally cached copy of one barcode file # even if it is marked remote. onlist_name = "index_onlist.txt" with create_temporary_barcode_files([onlist_name]) as tmpdir: - expected_onlist_path = os.path.join(tmpdir, onlist_name) spec_path = os.path.join(tmpdir, "spec.yaml") parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_onlist_args(subparser) - args = parser.parse_args([ - "onlist", "-m", "rna", "-i", "read1.fastq.gz", "-f", "multi", spec_path]) + args = parser.parse_args( + [ + "onlist", + "-m", + "rna", + "-i", + "read1.fastq.gz", + "-f", + "multi", + spec_path, + ] + ) def load_spec(*args, **kwargs): - remote_spec = example_spec.replace( - "location: local", - "location: remote" - ).replace( - "url: index_onlist.tsv", - "url: http://localhost:9/foo/index_onlist.tsv" - ).replace( - "urltype: local", - "urltype: http", + remote_spec = ( + example_spec.replace("location: local", "location: remote") + .replace( + "url: index_onlist.tsv", + "url: http://localhost:9/foo/index_onlist.tsv", + ) + .replace( + "urltype: local", + "urltype: http", + ) ) print(remote_spec) return load_example_spec(remote_spec) - with patch("seqspec.seqspec_onlist.load_spec", load_spec) as loader, patch("seqspec.seqspec_onlist.read_remote_list", return_value="index_onlist.tsv") as fake_remote_list: - # Failed validation would raise an exception - validate_onlist_args(parser, args) - + with patch("seqspec.seqspec_onlist.load_spec", load_spec), patch( + "seqspec.seqspec_onlist.read_remote_list", + return_value="index_onlist.tsv", + ): + with patch("pathlib.Path.exists", return_value=True): + # Failed validation would raise an exception + validate_onlist_args(parser, args) def test_write_onlist_no_double_spacing(self): # Make sure that joined onlists don't end up double spaced. 
- + onlists = [ ["AAAA", "TTTT"], ["GGGG", "CCCC", "GGTT"], diff --git a/tests/test_utils.py b/tests/test_utils.py index 34a36bc..0fbe293 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -2,14 +2,16 @@ from hashlib import md5 from io import StringIO, BytesIO import os -from pathlib import Path from tempfile import TemporaryDirectory from requests import HTTPError from unittest import TestCase from unittest.mock import patch from seqspec.Region import ( - Region, RegionCoordinate, Onlist, project_regions_to_coordinates + Region, + RegionCoordinate, + Onlist, + project_regions_to_coordinates, ) from seqspec.utils import ( get_remote_auth_token, @@ -18,9 +20,8 @@ write_read, read_local_list, read_remote_list, - yield_onlist_contents + yield_onlist_contents, ) -from seqspec import __version__ from .test_region import ( region_rna_joined_dict, @@ -154,6 +155,7 @@ parent_id: rna """ + def load_example_spec(spec_text): with StringIO(spec_text) as instream: spec = load_spec_stream(instream) @@ -217,7 +219,15 @@ def test_read_local_list(self): with gzip.open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") + onlist1 = Onlist( + "123", + temp_list_filename, + "tsv", + 300, + temp_list_filename, + "local", + fake_md5, + ) loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -232,7 +242,15 @@ def test_read_local_list_gz(self): with open(temp_list_filename, "wt") as stream: stream.write(fake_contents) - onlist1 = Onlist("123", temp_list_filename, "tsv", 300, temp_list_filename, "local", fake_md5, "local") + onlist1 = Onlist( + "123", + temp_list_filename, + "tsv", + 300, + temp_list_filename, + "local", + fake_md5, + ) loaded_list = read_local_list(onlist1) self.assertEqual(fake_onlist, loaded_list) @@ -256,7 +274,7 @@ def raise_for_status(self): with patch("requests.get", new=fake_request_get): url = "http://localhost/testlist.txt" - onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5, "remote") + onlist1 = Onlist("123", "testlist.txt", "http", 300, url, "http", fake_md5) loaded_list = read_remote_list(onlist1) self.assertEqual(fake_onlist, loaded_list) From 5ca2bbed4beb69bc4b678b63564addcf33e706d1 Mon Sep 17 00:00:00 2001 From: Mingjie Li Date: Wed, 16 Jul 2025 14:51:02 -0500 Subject: [PATCH 18/21] add bead_TSO to all schema --- seqspec/schema/seqspec_igvf.schema.json | 1 + seqspec/schema/seqspec_igvf_onlist_skip.schema.json | 1 + 2 files changed, 2 insertions(+) diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json index b476011..2b7a258 100644 --- a/seqspec/schema/seqspec_igvf.schema.json +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -163,6 +163,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json index 9bf86fd..3ec0d61 100644 --- a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -163,6 +163,7 @@ "enum": [ "atac", "barcode", + "bead_TSO", "cdna", "crispr", "custom_primer", From aff80515dd9d95de1c7855e51c53141f6f1cbc28 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Fri, 22 Aug 2025 19:18:20 -0500 Subject: [PATCH 19/21] CHECK-231-region-type (#72) (#16) --- docs/SPECIFICATION.md | 1 + 
docs/assays/10xcrispr.spec.yaml | 2 +- docs/assays/sccrispra.spec.yaml | 2 +- seqspec/schema/seqspec.schema.json | 1 + seqspec/schema/seqspec_igvf.schema.json | 1 + .../schema/seqspec_igvf_onlist_skip.schema.json | 1 + seqspec/seqspec_index.py | 17 +++++------------ 7 files changed, 11 insertions(+), 14 deletions(-) diff --git a/docs/SPECIFICATION.md b/docs/SPECIFICATION.md index f4ad0be..63b8384 100644 --- a/docs/SPECIFICATION.md +++ b/docs/SPECIFICATION.md @@ -153,6 +153,7 @@ Each `Region` has the following properties which are useful to annotate the elem - `rna`: The modality corresponding to assaying RNA. - `s5`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME1. - `s7`: A sequencing primer or adaptor typically used in the Nextera kit in conjunction with ME2. + - `sgrna_target`: A sequence corresponding to the guide RNA spacer region that determines the genomic target of CRISPR-based perturbations. - `tag`: A short sequence of DNA or RNA used to label or identify a sample, protein, or other grouping. - `truseq_read1`: The first read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. - `truseq_read2`: The second read primer in a paired-end sequencing run using the Illumina TruSeq Library preparation kit. diff --git a/docs/assays/10xcrispr.spec.yaml b/docs/assays/10xcrispr.spec.yaml index 87e4d36..a95cc5a 100644 --- a/docs/assays/10xcrispr.spec.yaml +++ b/docs/assays/10xcrispr.spec.yaml @@ -203,7 +203,7 @@ library_spec: parent_id: crispr - !Region region_id: sgrna_target - region_type: crispr + region_type: sgrna_target name: sgrna_target sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNXXX diff --git a/docs/assays/sccrispra.spec.yaml b/docs/assays/sccrispra.spec.yaml index e0f2790..c825dc3 100644 --- a/docs/assays/sccrispra.spec.yaml +++ b/docs/assays/sccrispra.spec.yaml @@ -223,7 +223,7 @@ library_spec: - !Region parent_id: crispr_R2_001.fastq.gz region_id: gRNA - region_type: gRNA + region_type: sgrna_target name: Guide RNAs sequence_type: onlist sequence: NNNNNNNNNNNNNNNNNNNN diff --git a/seqspec/schema/seqspec.schema.json b/seqspec/schema/seqspec.schema.json index 1fbf3aa..5b3ddd0 100644 --- a/seqspec/schema/seqspec.schema.json +++ b/seqspec/schema/seqspec.schema.json @@ -296,6 +296,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/schema/seqspec_igvf.schema.json b/seqspec/schema/seqspec_igvf.schema.json index 2b7a258..9d0ffd3 100644 --- a/seqspec/schema/seqspec_igvf.schema.json +++ b/seqspec/schema/seqspec_igvf.schema.json @@ -191,6 +191,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json index 3ec0d61..9908307 100644 --- a/seqspec/schema/seqspec_igvf_onlist_skip.schema.json +++ b/seqspec/schema/seqspec_igvf_onlist_skip.schema.json @@ -191,6 +191,7 @@ "rna", "s5", "s7", + "sgrna_target", "tag", "truseq_read1", "truseq_read2", diff --git a/seqspec/seqspec_index.py b/seqspec/seqspec_index.py index f0caa74..fff7f59 100644 --- a/seqspec/seqspec_index.py +++ b/seqspec/seqspec_index.py @@ -319,6 +319,9 @@ def get_index_by_primer( return {read_id: new_rcs, "strand": rdc.read.strand} +FEATURE_REGION_TYPES = {"CDNA", "GDNA", "PROTEIN", "TAG", "SGRNA_TARGET"} + + def format_kallisto_bus(indices, subregion_type=None): bcs = [] umi = [] @@ -331,12 +334,7 @@ def format_kallisto_bus(indices, subregion_type=None): 
bcs.append(f"{idx},{cut.start},{cut.stop}") elif cut.region_type.upper() == "UMI": umi.append(f"{idx},{cut.start},{cut.stop}") - elif ( - cut.region_type.upper() == "CDNA" - or cut.region_type.upper() == "GDNA" - or cut.region_type.upper() == "PROTEIN" - or cut.region_type.upper() == "TAG" - ): + elif cut.region_type.upper() in FEATURE_REGION_TYPES: feature.append(f"{idx},{cut.start},{cut.stop}") if len(umi) == 0: umi.append("-1,-1,-1") @@ -362,12 +360,7 @@ def format_kallisto_bus_force_single(indices, subregion_type=None): bcs.append(f"{idx},{cut.start},{cut.stop}") elif cut.region_type.upper() == "UMI": umi.append(f"{idx},{cut.start},{cut.stop}") - elif ( - cut.region_type.upper() == "CDNA" - or cut.region_type.upper() == "GDNA" - or cut.region_type.upper() == "PROTEIN" - or cut.region_type.upper() == "TAG" - ): + elif cut.region_type.upper() in FEATURE_REGION_TYPES: length = cut.stop - cut.start if length > max_length: max_length = length From 9fd3b559839e78c5627493b9522f7a753151f2a7 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Tue, 23 Sep 2025 14:52:26 -0500 Subject: [PATCH 20/21] CHECK-244-random-x (#18) --- seqspec/seqspec_check.py | 14 +- ...urementSet_X056_G4_RNA_rna_seqspec.yaml.gz | Bin 0 -> 1400 bytes tests/data/seqspec_valid_ignore_onlist.yaml | 197 ++++++++++++++++++ tests/test_seqspec_check.py | 37 +++- 4 files changed, 228 insertions(+), 20 deletions(-) create mode 100644 tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz create mode 100644 tests/data/seqspec_valid_ignore_onlist.yaml diff --git a/seqspec/seqspec_check.py b/seqspec/seqspec_check.py index 24a3a3f..672ebaa 100644 --- a/seqspec/seqspec_check.py +++ b/seqspec/seqspec_check.py @@ -453,20 +453,17 @@ def seqtype_check(rgn, errors, idx): "error_message": f"'{rgn.region_id}' sequence_type is 'random' and contains subregions", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and contains subregions" - # ) errors.append(errobj) idx += 1 - if rgn.sequence_type == "random" and rgn.sequence != "X" * rgn.max_len: + if rgn.sequence_type == "random" and ( + set(rgn.sequence) != {"X"} + or not (rgn.min_len <= len(rgn.sequence) <= rgn.max_len) + ): errobj = { "error_type": "check_sequence_types", "error_message": f"'{rgn.region_id}' sequence_type is 'random' and sequence is not all X's", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'random' and sequence is not all X's" - # ) errors.append(errobj) idx += 1 if rgn.sequence_type == "onlist" and not rgn.onlist: @@ -475,9 +472,6 @@ def seqtype_check(rgn, errors, idx): "error_message": f"'{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object", "error_object": "region", } - # errors.append( - # f"[error {idx}] '{rgn.region_id}' sequence_type is 'onlist' and does not have an onlist object" - # ) errors.append(errobj) idx += 1 if rgn.regions: diff --git a/tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz b/tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz new file mode 100644 index 0000000000000000000000000000000000000000..217d923c9d8ca9a0fa9284546891ba7a856d45f3 GIT binary patch literal 1400 zcmV-;1&8_{iwFP!000021MOJdbDKC2fA?RZzO@gwg+K!QRL5#2q(8rH38YG~9ZW31U@Z0P#f8hJcZ3!3PcJd%h2XXMFLB*8W-bPCq@{!GM)km+h8 zd73A3BctqEW~)0)*~&3ipagOx%$5(^M5fA?(I)nI7-WG=XQMGnK(8=xS80~yB5TxB zi%-@;_BnSaK|HM-GF7q(Paap>B+gNp3Q>3;8>0Cwv7^rX=dimjO 
zIl~z18n$2U*7!4k4Qub9nmhYV%Tt!SF4ZRE~3e@+j?|=^PA%|PQMRGk9 zxJJVs;40=xz!BZBAjjx;;8tzhgIljcM{#2oH?a^Fm4@a~&+?cpuqPSQG)#zg95;q- zU|I4>euKPTucgVYo?ExVHqCd zwvfJWP-ZhsOv^Nn!)4h_pI@BacG8SMnwif@GcqvEhS$fIhKIn_%#(oAOk!(rTpQ2E zt={$!w|<3=;l^!C2uZ1QeQ=#iEX^gJ$+7ULB_UyVM{VnDTdO7jAm9m{fQ6r;JPZqD z6{`lNA(T6pq+?CaSRLIkQV7-OX*utYIEbXzqR-HLe&eX=jP)uzz6Rx3Na-K z|DAS9>C9^9lvxbpC=Al969{!)?ILC`X3B8r2F*CA`m@!Rw%KJUbeUBm)-Bz~lG=$M zKX@p@XK(~dTiT#Y*xH#XOllMPp;uFvL%UYc8XAAPo#T6K(V@ZGnpD2Lth z*jV=BL$laHO@z`;-_I}KwYcr7ShFW0y{}79yDDbseT!CAMU#?>_>GE~aW@ZnvR4r6 z9S;j)zb-umu_}qS>$<)r4D9+Q)r91jV(LR9!WrW(wtfiOoxZSx`2n|TWZ-60U2=9; zhhgFVtwOjT7`nb67}fZPhrEzsxN>2VDnP(gBtXi{YNQJvpr36m3k$P~lb5 zyIgJrPc`-5{B%xN+&P38#y_Xnbgl@;D=ulF)ULKB>v%+@=!mf=rpX#yA-X7Z=oCe1)6YT=f34Sst`2PkNP%)`y G7XSbZ`mVnK literal 0 HcmV?d00001 diff --git a/tests/data/seqspec_valid_ignore_onlist.yaml b/tests/data/seqspec_valid_ignore_onlist.yaml new file mode 100644 index 0000000..2d73ac0 --- /dev/null +++ b/tests/data/seqspec_valid_ignore_onlist.yaml @@ -0,0 +1,197 @@ +!Assay +seqspec_version: 0.3.0 +assay_id: 10x-ATAC-RNA-MULTI +name: 10x-ATAC-RNA-MULTI/Illumina +doi: https://doi.org/10.1038/s41592-019-0433-8 +date: 17 June 2019 +description: ansuman-satpathy:igvf_exp11_atac_10x4_NGS1 Single Cell Multiome ATAC +modalities: +- atac +lib_struct: https://igvf.github.io/seqspec/ +library_protocol: single-nucleus ATAC-seq (OBI:0002762) +library_kit: Illumina Truseq Dual Index +sequence_protocol: Illumina NovaSeq X +sequence_kit: NovaSeq X Series 10B Reagent Kit +sequence_spec: +- !Read + read_id: 1165AJSO + name: Read 1 + modality: atac + primer_id: atac-nextera_read1 + min_len: 50 + max_len: 50 + strand: pos + files: + - !File + file_id: IGVFFI1165AJSO + filename: IGVFFI1165AJSO.fastq.gz + filetype: '' + filesize: 4960657092 + url: https://api.data.igvf.org/sequence-files/IGVFFI1165AJSO/@@download/IGVFFI1165AJSO.fastq.gz + urltype: https + md5: 0a4d87a0edf52511e72948c11de9df8b +- !Read + read_id: IGVFFI2309FCAH + name: Index 1 (i7 index) + modality: atac + primer_id: atac-nextera_read2 + min_len: 8 + max_len: 8 + strand: pos + files: + - !File + file_id: IGVFFI2309FCAH + filename: IGVFFI2309FCAH.fastq.gz + filetype: '' + filesize: 1176913287 + url: https://api.data.igvf.org/sequence-files/IGVFFI2309FCAH/@@download/IGVFFI2309FCAH.fastq.gz + urltype: https + md5: 1f17b83b0c293ad74507cf0dde38a286 +- !Read + read_id: IGVFFI6229GGKZ + name: Read 2 (technically Index 2 (i5 index)) + modality: atac + primer_id: atac-nextera_read1 + min_len: 24 + max_len: 24 + strand: neg + files: + - !File + file_id: IGVFFI6229GGKZ + filename: IGVFFI6229GGKZ.fastq.gz + filetype: '' + filesize: 2696388379 + url: https://api.data.igvf.org/sequence-files/IGVFFI6229GGKZ/@@download/IGVFFI6229GGKZ.fastq.gz + urltype: https + md5: bc9775c746941a760da73a6304c1b0bd +- !Read + read_id: IGVFFI9141IFTT + name: Read 3 (technically Read 2) + modality: atac + primer_id: atac-nextera_read2 + min_len: 50 + max_len: 50 + strand: neg + files: + - !File + file_id: IGVFFI9141IFTT + filename: IGVFFI9141IFTT.fastq.gz + filetype: '' + filesize: 4922922820 + url: https://api.data.igvf.org/sequence-files/IGVFFI9141IFTT/@@download/IGVFFI9141IFTT.fastq.gz + urltype: https + md5: 0a8ee69e4918bb52664bbf4a3842c405 +library_spec: +- !Region + parent_id: null + region_id: atac + region_type: bead_TSO + name: ATAC + sequence_type: joined + sequence: 
AATGATACGGCGACCACCGAGATCTACACNNNNNNNNNNNNNNNNCGCGTCTGTCGTCGGCAGCGTCAGATGTGTATAAGAGACAGXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXCTGTCTCTTATACACATCTCCGAGCCCACGAGACNNNNNNNNATCTCGTATGCCGTCTTCTGCTTG + min_len: 352 + max_len: 352 + onlist: null + regions: + - !Region + parent_id: atac + region_id: atac-illumina_p5 + region_type: illumina_p5 + name: Illumina P5 + sequence_type: fixed + sequence: AATGATACGGCGACCACCGAGATCTACAC + min_len: 29 + max_len: 29 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-cell_barcode + region_type: barcode + name: R2 Cell Barcode + sequence_type: onlist + sequence: NNNNNNNNNNNNNNNN + min_len: 16 + max_len: 16 + onlist: !Onlist + file_id: IGVFFI7587TJLC + filename: IGVFFI7587TJLC.tsv.gz + filetype: '' + filesize: 2465078 + url: https://api.data.igvf.org/tabular-files/IGVFFI7587TJLC/@@download/IGVFFI7587TJLC.tsv.gztest + urltype: https + md5: 91f5bd173373fa1815830444480236fb + regions: null + - !Region + parent_id: atac + region_id: atac-linker + region_type: linker + name: atac linker + sequence_type: fixed + sequence: CGCGTCTG + min_len: 8 + max_len: 8 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-nextera_read1 + region_type: nextera_read1 + name: nextera_read1 + sequence_type: fixed + sequence: TCGTCGGCAGCGTCAGATGTGTATAAGAGACAG + min_len: 33 + max_len: 33 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: gDNA + region_type: gdna + name: gDNA + sequence_type: random + sequence: XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX + min_len: 200 + max_len: 200 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-nextera_read2 + region_type: nextera_read2 + name: nextera_read2 + sequence_type: fixed + sequence: CTGTCTCTTATACACATCTCCGAGCCCACGAGAC + min_len: 34 + max_len: 34 + onlist: null + regions: null + - !Region + parent_id: atac + region_id: atac-index7 + region_type: index7 + name: ATAC index7 + sequence_type: onlist + sequence: NNNNNNNN + min_len: 8 + max_len: 8 + onlist: !Onlist + file_id: IGVFFI1608YDWY + filename: IGVFFI1608YDWY.csv.gz + filetype: '' + filesize: 1658 + url: https://api.data.igvf.org/tabular-files/IGVFFI1608YDWY/@@download/IGVFFI1608YDWY.csv.gz + urltype: https + md5: db54507732297fafea74bacfcc203238 + regions: null + - !Region + parent_id: atac + region_id: atac-illumina_p7 + region_type: illumina_p7 + name: Illumina P7 + sequence_type: fixed + sequence: ATCTCGTATGCCGTCTTCTGCTTG + min_len: 24 + max_len: 24 + onlist: null + regions: null diff --git a/tests/test_seqspec_check.py b/tests/test_seqspec_check.py index 75d31bd..c81957a 100644 --- a/tests/test_seqspec_check.py +++ b/tests/test_seqspec_check.py @@ -1,8 +1,9 @@ -from argparse import ArgumentParser from pathlib import Path from tempfile import TemporaryDirectory from unittest import TestCase from unittest.mock import patch +from seqspec.seqspec_check import run_check +from argparse import ArgumentParser, Namespace from seqspec.seqspec_check import ( setup_check_args, @@ -53,8 +54,6 @@ def test_validate_check_args(self): def test_check_with_igvf_skip(self): """Test that 'igvf' skip condition filters out some IGVF-related errors but not read_id pattern errors.""" - from 
seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace # Create a parser parser = ArgumentParser() @@ -104,16 +103,14 @@ def test_check_with_igvf_skip(self): def test_check_with_igvf_onlist_skip(self): """Test that 'igvf_onlist_skip' skip condition filters out IGVF and onlist errors including read_id pattern.""" - from seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace - + file_path = "tests/data/seqspec_valid_ignore_onlist.yaml" # Create a parser parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") subparser = setup_check_args(subparser) # Test file path - test_file = Path("tests/data/seqspec_valid_ignore_onlist.yaml") + test_file = Path(file_path) # Test with 'igvf_onlist_skip' skip args = Namespace() @@ -131,9 +128,6 @@ def test_check_with_igvf_onlist_skip(self): def test_check_without_skip(self): """Test that without skip condition, validation errors are reported.""" - from seqspec.seqspec_check import run_check - from argparse import ArgumentParser, Namespace - # Create a parser parser = ArgumentParser() subparser = parser.add_subparsers(dest="command") @@ -176,3 +170,26 @@ def test_check_without_skip(self): self.assertEqual( len(onlist_errors), 1, f"Expected 1 onlist error, got {len(onlist_errors)}" ) + + def test_check_sequence_type_random_x(self): + file_path = ( + "tests/data/2881_corces_measurementSet_X056_G4_RNA_rna_seqspec.yaml.gz" + ) + # Create a parser + parser = ArgumentParser() + subparser = parser.add_subparsers(dest="command") + subparser = setup_check_args(subparser) + + # Test file path + test_file = Path(file_path) + + # Test without skip + args = Namespace() + args.yaml = test_file + args.output = None + args.skip = None + + # Run check without skip + errors = run_check(parser, args) + # should have no errors + self.assertEqual(len(errors), 0) From 876aeefa058d2e8a73cf853a0f2e074f271ef288 Mon Sep 17 00:00:00 2001 From: Mingjie Li <44071821+mingjiecn@users.noreply.github.com> Date: Tue, 27 Jan 2026 14:32:50 -0600 Subject: [PATCH 21/21] update readme (#21) --- docs/INSTALLATION.md | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/docs/INSTALLATION.md b/docs/INSTALLATION.md index 9e7db60..46e683a 100644 --- a/docs/INSTALLATION.md +++ b/docs/INSTALLATION.md @@ -7,16 +7,8 @@ authors: # Installation -The development version can be installed with - -```bash -pip install git+https://github.com/pachterlab/seqspec@devel -``` - -The official release can be installed directly from pypi - ```bash -pip install seqspec +pip install git+https://github.com/IGVF-DACC/seqspec.git@main ``` Verify the installation
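
A minimal way to verify the installation, not part of the patch itself: the commands below are illustrative and only assume the `seqspec` console entry point installed by the package and that `pip` points at the same environment used for the install above.

```bash
# Confirm the seqspec CLI entry point is on PATH and list its subcommands
seqspec --help

# Show the installed package metadata (name, version, install location)
pip show seqspec
```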