diff --git a/.github/workflows/black.yml b/.github/workflows/black.yml index aec1766ed..46241541a 100644 --- a/.github/workflows/black.yml +++ b/.github/workflows/black.yml @@ -1,14 +1,15 @@ name: Lint -on: [pull_request] +on: [push, pull_request] jobs: lint: runs-on: ubuntu-latest steps: - - uses: actions/checkout@v2 - - uses: actions/setup-python@v2 - - uses: psf/black@stable + - uses: actions/checkout@v6 + - uses: actions/setup-python@v5 with: - options: "--check --diff --color --verbose" - jupyter: true + python-version: "3.12" + - run: pip install ruff==0.15.0 + - run: ruff check . + - run: ruff format --check . diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 365a8b1b1..692182e85 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -1,6 +1,3 @@ -# This workflows will upload a Python Package using Twine when a release is created -# For more information see: https://help.github.com/en/actions/language-and-framework-guides/using-python-with-github-actions#publishing-to-package-registries - name: Upload Python Package on: @@ -12,19 +9,18 @@ jobs: runs-on: ubuntu-latest name: upload release to PyPI permissions: - id-token: write # IMPORTANT: this permission is mandatory for trusted publishing + contents: read + id-token: write + steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.x" - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install setuptools wheel twine - - name: Build and publish - run: | - python setup.py sdist bdist_wheel - - name: Publish package distributions to PyPI - uses: pypa/gh-action-pypi-publish@release/v1 + - uses: actions/checkout@v6 + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.x' + - name: Install build dependencies + run: python -m pip install --upgrade pip build + - name: Build package + run: python -m build + 
- name: Publish package distributions to PyPI + uses: pypa/gh-action-pypi-publish@release/v1 diff --git a/.github/workflows/run-pytest.yml b/.github/workflows/run-pytest.yml index b518c4048..8fd3c126a 100644 --- a/.github/workflows/run-pytest.yml +++ b/.github/workflows/run-pytest.yml @@ -9,22 +9,19 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - python-version: ["3.8", "3.11"] - os: [ubuntu-latest] + python-version: ["3.10", "3.14"] + os: [ubuntu-latest, macos-latest] steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v6 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v5 + with: + python-version: ${{ matrix.python-version }} - - name: Install test dependencies - run: if [ -f requirements/requirements-test.txt ]; then pip install -r requirements/requirements-test.txt; fi + - name: Install package with test dependencies + run: python -m pip install ".[test]" - - name: Install package - run: python -m pip install . 
- - - name: Run pytest tests - run: pytest tests -x -vv --remote-data + - name: Run pytest tests + run: pytest tests diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c91376149..a136b1232 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,20 +1,15 @@ repos: - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v4.0.1 + rev: v5.0.0 hooks: - id: trailing-whitespace - id: check-yaml - id: end-of-file-fixer - - id: requirements-txt-fixer - - id: trailing-whitespace - - - repo: https://github.com/PyCQA/isort - rev: 5.9.1 - hooks: - - id: isort - args: ["--profile", "black"] + - id: check-ast - - repo: https://github.com/psf/black - rev: 21.6b0 + - repo: https://github.com/astral-sh/ruff-pre-commit + rev: v0.15.0 hooks: - - id: black + - id: ruff + args: [--fix] + - id: ruff-format diff --git a/MANIFEST.in b/MANIFEST.in deleted file mode 100644 index 15473d351..000000000 --- a/MANIFEST.in +++ /dev/null @@ -1,9 +0,0 @@ -include requirements/* -include README.md -include logo_looper.svg -include looper/jinja_templates/* -include looper/default_config/* -include looper/default_config/divvy_templates/* -include looper/jinja_templates_old/* -include looper/schemas/* -include looper/command_models/* diff --git a/README.md b/README.md index 631f686c4..9e6bdefd0 100644 --- a/README.md +++ b/README.md @@ -1,4 +1,4 @@ -# looper logo +# looper logo ![Run pytests](https://github.com/pepkit/looper/workflows/Run%20pytests/badge.svg) [![PEP compatible](http://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) diff --git a/docs/changelog.md b/changelog.md similarity index 96% rename from docs/changelog.md rename to changelog.md index 268f8e0e9..c77934a9b 100644 --- a/docs/changelog.md +++ b/changelog.md @@ -2,6 +2,25 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html) and [Keep a Changelog](https://keepachangelog.com/en/1.0.0/) format. 
+## [2.1.0] -- 2026-02-25 +### Changed +- Migrated to new yacman API (`YAMLConfigManager.from_yaml_file()`, `write_lock`/`read_lock` context managers); requires yacman >=0.9.5 +- Migrated CLI configuration to pydantic-settings +- Improved CLI startup time by deferring heavy imports to module level +- Made signal handling thread-safe +- Replaced wildcard imports with explicit imports +- Converted docstrings to Google style +- Better shell inference for submission commands (#282) +- Updated pipestat constructor usage to classmethod format +- Separated fast unit tests from slow CLI integration tests + +### Added +- HTTP API server (experimental/alpha) with FastAPI: `looper serve` +- Stricter looper-pipestat interface validation + +### Removed +- Old documentation files (moved to separate docs site) + ## [2.0.3] -- 2025-09-23 ### Fixed - Fixed [#543](https://github.com/pepkit/looper/issues/543) diff --git a/docs/README.md b/docs/README.md deleted file mode 100644 index 026059840..000000000 --- a/docs/README.md +++ /dev/null @@ -1,57 +0,0 @@ -# pipeline submitting engine - -[![PEP compatible](https://pepkit.github.io/img/PEP-compatible-green.svg)](http://pepkit.github.io) - -## What is looper? - -Looper is a job submitting engine. Looper deploys arbitrary shell commands for each sample in a [standard PEP project](https://pepkit.github.io/docs/home/). You can think of looper as providing a single user interface to running, monitoring, and managing all of your sample-intensive research projects the same way, regardless of data type or pipeline used. - -## What makes looper better? - -Looper **decouples job handling from the pipeline process**. In a typical pipeline, job handling (managing how individual jobs are submitted to a cluster) is delicately intertwined with actual pipeline commands (running the actual code for a single compute job). In contrast, the looper approach is modular: looper *only* manages job submission. 
This approach leads to several advantages compared with the traditional integrated approach: - -1. pipelines do not need to independently re-implement job handling code, which is shared. -2. every project uses a universal structure, so datasets can move from one pipeline to another. -3. users must learn only a single interface that works with any project for any pipeline. -4. running just one or two samples/jobs is simpler, and does not require a distributed compute environment. - - - - -## Installing - -Releases are posted as [GitHub releases](https://github.com/pepkit/looper/releases), or you can install using `pip`: - - -```console -pip install --user looper -``` - -Update with: - -```console -pip install --user --upgrade looper -``` - -If the `looper` executable in not automatically in your `$PATH`, add the following line to your `.bashrc` or `.profile`: - -```console -export PATH=~/.local/bin:$PATH -``` - -## Quick start - -To test `looper`, follow the [Hello Looper example repository](https://github.com/databio/hello_looper) to run your first looper project: - - -```console -# download and unzip the hello_looper repository -wget https://github.com/databio/hello_looper/archive/master.zip -unzip master.zip - -# Run looper: -cd hello_looper-master -looper run --looper-config .looper.yaml project/project_config.yaml -``` - -Detailed explanation of results is in the [Hello world tutorial](hello-world.md). diff --git a/docs/advanced.md b/docs/advanced.md deleted file mode 100644 index e2d653bc1..000000000 --- a/docs/advanced.md +++ /dev/null @@ -1,85 +0,0 @@ -# Advanced features - -## Handling multiple input files -Sometimes you have multiple input files that you want to merge for one sample. -For example, a common use case is a single library that was spread across multiple sequencing lanes, -yielding multiple input files that need to be merged and then run through the pipeline as one unit. 
-Rather than putting multiple lines in your sample annotation sheet, which causes conceptual and analytical challenges, -we introduce **two ways to merge inputs**: - -1. Use *shell expansion characters* (`*` or `[]`) in your `data_source` definition or filename; -for relatively simple merge cases this works well. -2. Specify a *merge table*, which maps input files to samples for samples with more than one input file. -To accommodate complex merger use cases, this is infinitely customizable. - -To do the first option, simply change data source specification: - -```yaml -data_sources: - data_R1: "${DATA}/{id}_S{nexseq_num}_L00*_R1_001.fastq.gz" - data_R2: "${DATA}/{id}_S{nexseq_num}_L00*_R2_001.fastq.gz" -``` - -For the second option, provide *in the `metadata` section* of your project config file a path to merge table file: - -```yaml -metadata: - merge_table: mergetable.csv -``` - -Make sure the `sample_name` column of this table matches, and then include any columns needed to point to the data. -Looper will automatically include all of these files as input passed to the pipelines. - -***Warning***: do not use *both* of these options for the same sample at the same time; that will lead to multiple mergers. - -**Note**: mergers are *not* the way to handle different functional/conceptual *kinds* of input files (e.g., `read1` and `read2` for a sample sequenced with a paired-end protocol). -Such cases should be handled as *separate derived columns* in the main sample annotation sheet if they're different arguments to the pipeline. - - -## Connecting to multiple pipelines - -If you have a project that contains samples of different types, then you may need to specify multiple pipeline repositories to your project. -Starting in version 0.5, looper can handle a priority list of pipelines. -Starting with version 0.6, each path should be directly to a pipeline interface file. 
- -**Example**: - -```yaml -metadata: - pipeline_interfaces: [pipeline_iface1.yaml, pipeline_iface2.yaml] -``` - -In this case, for a given sample, `looper` will first look in `pipeline_iface1.yaml` -to see if an appropriate (i.e., protocol-matched) pipeline exists for this sample type. -If one is found, `looper` will use that pipeline (or set of pipelines, as specified in the `protocol_mapping`). -Once a pipeline is submitted any remaining interface files will be ignored. -Until an appropriate pipeline is found, each interface file will be considered in succession. -If no suitable pipeline is found in any interface, the sample will be skipped. -In other words, the `pipeline_interfaces` value specifies a *prioritized* search list. - -## Set up tab completion - -Source `bash_complete.sh` to your `~/.bashrc` to get basic tab completion for Looper. - -Then, simply type `looper ` to see a list of commands and `looper comma` to get autocompletion for specific commands. - -Source script to add to `~/.bashrc`: -```bash -# Begin looper bash autocomplete -_looper_autocomplete() -{ - local cur prev opts1 - cur=${COMP_WORDS[COMP_CWORD]} - prev=${COMP_WORDS[COMP_CWORD-1]} - opts1=$(looper --commands) - case ${COMP_CWORD} in - 1) - COMPREPLY=($(compgen -W "${opts1}" -- ${cur})) - ;; - 2) - COMPREPLY=() - ;; - esac -} && complete -o bashdefault -o default -F _looper_autocomplete looper -# end looper bash autocomplete -``` \ No newline at end of file diff --git a/docs/autodoc_build/.gitignore b/docs/autodoc_build/.gitignore deleted file mode 100644 index d6b7ef32c..000000000 --- a/docs/autodoc_build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/docs/concentric-templates.md b/docs/concentric-templates.md deleted file mode 100644 index 8ca3155a5..000000000 --- a/docs/concentric-templates.md +++ /dev/null @@ -1,60 +0,0 @@ -# Looper's concentric template system - -## Introduction - -To build job scripts, looper uses a 2-level template system consisting of an 
inner template wrapped by an outer template. The inner template is called a *command template*, which produces the individual commands to execute. The outer template is the *submission template*, which wraps the commands in environment handling code. This layered design allows us to decouple the computing environment from the pipeline, which improves portability. - -## The command template - -The command template is specified by a pipeline in the pipeline interface. A very basic command template could be something like this: - -```console -pipeline_command {sample.input_file} --arg -``` - -In the simplest case, looper can run the pipeline by simply running these commands. This example contains no information about computing environment, such as SLURM submission directives. - -## The submission template - -To extend to submitting the commands to a cluster, we simply need to add some more information around the command above, specifying things like memory use, job name, *etc.* It may be tempting to add these details directly to the command template, causing the jobs to be submitted to SLURM instead of run directly. This *would* work; however, this would restrict the pipeline to *only* running via SLURM, since the submission code would be tightly coupled to the command code. Instead, looper retains flexibility by introducing a second template layer, the *submission template*. While the *command template* is specified by the pipeline interface, the *submission template* is specified at the level of the computing environment. A submission template can also be as simple or complex as required. For a command to be run in a local computing environment, a basic template will suffice: - -```console -#! 
/usr/bin/bash - -{CODE} -``` - -A more complicated template could submit a job to a SLURM cluster: - -```console -#!/bin/bash -#SBATCH --job-name='{JOBNAME}' -#SBATCH --output='{LOGFILE}' -#SBATCH --mem='{MEM}' -#SBATCH --cpus-per-task='{CORES}' -#SBATCH --time='{TIME}' -echo 'Compute node:' `hostname` -echo 'Start time:' `date +'%Y-%m-%d %T'` - -srun {CODE} -``` - -In these templates, the `{CODE}` variable is populated by the populated result of the command template -- that's what makes these templates concentric. - -## The advantages of concentric templates - -Looper first populates the command template, and then provides the output as a variable and used to populate the `{CODE}` variable in the submission template. This decoupling provides substantial advantages: - -1. The commands can be run on any computing environment by simply switching the submission template. -2. The submission template can be used for any computing environment parameters, such as containers. -3. The submission template only has to be defined once *per environment*, so many pipelines can use them. -4. We can [group multiple individual commands](grouping-jobs.md) into a single submission script. -5. The submission template is universal and can be handled by dedicated submission template software. - -## Looper and divvy - -The last point about the submission template being universal is exactly what looper does. Looper uses [divvy](http://divvy.databio.org) to handle submission templates. Besides being useful for looper, this means the divvy submission templates can be used for interactive submission of jobs, or used by other software. It also means to configure looper to work with your computing environment, you just have to configure divvy. - -## Populating templates - -The task of running jobs can be thought of as simply populating the templates with variables. To do this, Looper provides [variables from several sources](variable-namespaces.md). 
diff --git a/docs/config-files.md b/docs/config-files.md deleted file mode 100644 index bb6bfe5e0..000000000 --- a/docs/config-files.md +++ /dev/null @@ -1,44 +0,0 @@ -# Configuration files - -Looper uses [YAML](http://www.yaml.org/) configuration files for several purposes. -It's designed to be organized, modular, and very configurable, so there are several configuration files. -We've organized these files so that each handle a different level of infrastructure - -- Environment -- Project -- Pipeline - -This makes the system very adaptable and portable, but for a newcomer, it is easy to map each to its purpose. -So, here's an explanation of each for you to use as a reference until you are familiar with the whole ecosystem. -Which ones you need to know about will depend on whether you're a pipeline *user* (running pipelines on your project) -or a pipeline *developer* (building your own pipeline). - - -## Pipeline users - -Users (non-developers) of pipelines only need to be aware of one or two config files. - -### Project configuration - -[**project config**](defining-a-project.md) -- this file is specific to each project and contains information about the project's metadata, where the processed files should be saved, and other variables that allow to configure the pipelines specifically for this project. It follows the standard Portable Encapsulated Project format, or PEP for short. - -### Environment configuration - -[**environment config**](http://divvy.databio.org/en/latest/configuration/) -- if you are planning to submit jobs to a cluster, then you need to be aware of environment configuration. This task is farmed out to [divvy](http://divvy.databio.org/en/latest/), a computing resource configuration manager. Follow the divvy documentation to learn about ways to tweak the computing environment settins according to your needs. - -That should be all you need to worry about as a pipeline user. 
If you need to adjust compute resources or want to develop a pipeline or have more advanced project-level control over pipelines, you'll need knowledge of the config files used by pipeline developers. - - -## Pipeline developers - -### Pipeline configuration - -If you want to make pipeline compatible with looper, tweak the way looper interacts with a pipeline for a given project, -or change the default cluster resources requested by a pipeline, you need to know about a configuration file that coordinates linking pipelines to a project. This happens via the [pipeline interface file](pipeline-interface-specification.md). - -Finally, if you're using [the pypiper framework](https://github.com/databio/pypiper) to develop pipelines, -it uses a pipeline-specific configuration file, which is detailed in the [pypiper documentation](http://pypiper.readthedocs.io/en/latest/advanced.html#pipeline-config-files). - -Essentially, each pipeline may provide a configuration file describing where software is, -and parameters to use for tasks within the pipeline. This configuration file is by default named like pipeline name, -with a `.yaml` extension instead of `.py`. For example, by default `rna_seq.py` looks for an accompanying `rna_seq.yaml` file. diff --git a/docs/containers.md b/docs/containers.md deleted file mode 100644 index fbe26eaad..000000000 --- a/docs/containers.md +++ /dev/null @@ -1,64 +0,0 @@ -# How to run jobs in a linux container - -Because `looper` uses `divvy` for computing configuration, running jobs in containers is easy! `Divvy` can use the same template system to do either cluster computing or to run jobs in linux containers (for example, using `docker` or `singularity`). You can even run jobs in a container *on a cluster*. - -All you need to do is follow the same instructions as in [running jobs on a cluster](running-on-a-cluster.md), but use templates that run those jobs in containers. 
To see examples of how to do this, refer to the [divvy docs on running containers](http://divvy.databio.org/en/latest/containers/). - - -## Overview - -Here is a quick guide to get you started using containers with `looper`: - -### 1. Get your container image. - -This could be a docker image (hosted on dockerhub), which you would download via `docker pull`, or it could be a `singularity` image you have saved in a local folder. This is pipeline-specific, and you'll need to download the image recommended by the authors of the pipeline or pipelines you want to run. - - -### 2. Specify the image in your `pipeline_interface` - -The `pipeline_interface.yaml` file will need a `compute` section for each pipeline that can be run in a container, specifying the image. For example: - - -```yaml -compute: - singularity_image: ${SIMAGES}myimage - docker_image: databio/myimage -``` - -For singularity images, you just need to make sure that the images indicated in the `pipeline_interface` are available in those locations on your system. For docker, make sure you have the docker images pulled. - - -### 3. Configure your `DIVCFG`. - -`Divvy` will need templates that work with the container. This just needs to be set up once for your compute environment, which would enable you to run any pipeline in a container (as long as you have an image). You should set up the DIVCFG compute environment configuration by following instructions in the [DIVCFG readme](https://github.com/pepkit/divcfg). If it's not already container-aware, you will just need to add a new container-aware "compute package" to your DIVCFG file. Here's an example of how to add one for using singularity in a SLURM environment: - -```yaml -singularity_slurm: - submission_template: templates/slurm_singularity_template.sub - submission_command: sbatch - singularity_args: -B /sfs/lustre:/sfs/lustre,/nm/t1:/nm/t1 -``` - -In `singularity_args` you'll need to pass any mounts or other settings to be passed to singularity. 
The actual `slurm_singularity_template.sub` file looks something like this: - -```bash -#!/bin/bash -#SBATCH --job-name='{JOBNAME}' -#SBATCH --output='{LOGFILE}' -#SBATCH --mem='{MEM}' -#SBATCH --cpus-per-task='{CORES}' -#SBATCH --time='{TIME}' -#SBATCH --partition='{PARTITION}' -#SBATCH -m block -#SBATCH --ntasks=1 - -echo 'Compute node:' `hostname` -echo 'Start time:' `date +'%Y-%m-%d %T'` - -singularity instance.start {SINGULARITY_ARGS} {SINGULARITY_IMAGE} {JOBNAME}_image -srun singularity exec instance://{JOBNAME}_image {CODE} - -singularity instance.stop {JOBNAME}_image -``` - -Notice how these values will be used to populate a template that will run the pipeline in a container. Now, to use singularity, you just need to activate this compute package in the usual way, which is using the `package` argument: ``looper run --package singularity_slurm``. diff --git a/docs/contributing.md b/docs/contributing.md deleted file mode 100644 index 412132b02..000000000 --- a/docs/contributing.md +++ /dev/null @@ -1,10 +0,0 @@ -# Contributing - -Pull requests or issues are welcome. - -- After adding tests in `tests` for a new feature or a bug fix, please run the test suite. -- To do so, the only additional dependencies needed beyond those for the package can be installed with: - - `pip install -r requirements/requirements-test.txt` - -- Once those are installed, the tests can be run with `pytest` or `python setup.py test`. diff --git a/docs/defining-a-project.md b/docs/defining-a-project.md deleted file mode 100644 index 14225969d..000000000 --- a/docs/defining-a-project.md +++ /dev/null @@ -1,42 +0,0 @@ -# How to define a project - -## 1. Start with a basic PEP - -To start, you need a project defined in the [standard Portable Encapsulated Project (PEP) format](http://pep.databio.org). Start by [creating a PEP](https://pep.databio.org/en/latest/simple_example/). - -## 2. Specify the Sample Annotation - -This information generally lives in a `project_config.yaml` file. 
- -Simplest example: -```yaml -pep_version: 2.0.0 -sample_table: sample_annotation.csv -``` - -A more complicated example taken from [PEPATAC](https://pepatac.databio.org/en/latest/): - -```yaml -pep_version: 2.0.0 -sample_table: tutorial.csv - -sample_modifiers: - derive: - attributes: [read1, read2] - sources: - # Obtain tutorial data from http://big.databio.org/pepatac/ then set - # path to your local saved files - R1: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r1.fastq.gz" - R2: "${TUTORIAL}/tools/pepatac/examples/data/{sample_name}_r2.fastq.gz" - imply: - - if: - organism: ["human", "Homo sapiens", "Human", "Homo_sapiens"] - then: - genome: hg38 - prealignment_names: ["rCRSd"] - deduplicator: samblaster # Default. [options: picard] - trimmer: skewer # Default. [options: pyadapt, trimmomatic] - peak_type: fixed # Default. [options: variable] - extend: "250" # Default. For fixed-width peaks, extend this distance up- and down-stream. - frip_ref_peaks: None # Default. Use an external reference set of peaks instead of the peaks called from this run -``` \ No newline at end of file diff --git a/docs/derived-columns.md b/docs/derived-columns.md deleted file mode 100644 index c13713f46..000000000 --- a/docs/derived-columns.md +++ /dev/null @@ -1,74 +0,0 @@ -# Derived columns - -On your sample sheet, you will need to point to the input file or files for each sample. -Of course, you could just add a column with the file path, like `/path/to/input/file.fastq.gz`. For example: - -A ***bad* example**: - -```CSV -sample_name,library,organism,time,file_path -pig_0h,RRBS,pig,0,/data/lab/project/pig_0h.fastq -pig_1h,RRBS,pig,1,/data/lab/project/pig_1h.fastq -frog_0h,RRBS,frog,0,/data/lab/project/frog_0h.fastq -frog_1h,RRBS,frog,1,/data/lab/project/frog_1h.fastq -``` - -This is common, and it works in a pinch with Looper, but what if the data get moved, or your filesystem changes, or you switch servers or move institutes? -Will this data still be there in 2 years? 
Do you want long file paths cluttering your annotation sheet? -What if you have 2 or 3 input files? Do you want to manually manage these unwieldy absolute paths? - -Looper makes it really easy to do better. You can make one or your annotation columns into a flexible *derived column* -that will be populated based on a source template you specify in the project configuration file. -What was originally `/long/path/to/sample.fastq.gz` would instead contain just a key, like `source1`. -Columns that use a key like this are called *derived columns*. -Here's an example of the same sheet using a derived column (`file_path`): - -A ***good* example**: -```CSV -sample_name,library,organism,time,file_path -pig_0h,RRBS,pig,0,source1 -pig_1h,RRBS,pig,1,source1 -frog_0h,RRBS,frog,0,source1 -frog_1h,RRBS,frog,1,source1 -``` - -For this to succeed, your project config file must specify two things: -- Which columns are to be derived (in this case, ``file_path``) -- A `data_sources` section mapping keys to strings that will construct your path, like this: - ```yaml - derived_columns: [file_path] - data_sources: - source1: /data/lab/project/{sample_name}.fastq - source2: /path/from/collaborator/weirdNamingScheme_{external_id}.fastq - ``` - -That's it! The source string can use other sample attributes (columns) using braces, as in `{sample_name}`. -The attributes will be automatically populated separately for each sample. -To take this a step further, you'd get the same result with this config file, -which substitutes `{sample_name}` for other sample attributes, `{organism}` and `{time}`: - -```yaml -derived_columns: [file_path] -data_sources: - source1: /data/lab/project/{organism}_{time}h.fastq - source2: /path/from/collaborator/weirdNamingScheme_{external_id}.fastq -``` - -As long as your file naming system is systematic, you can easily deal with any external naming scheme, no problem at all. -The idea is this: don't put *absolute* paths to files in your annotation sheet. 
-Instead, specify a data source and then provide a regex in the config file. - -Then if your data change locations (which happens more often than we would like), or you change servers, -or you want to share or publish the project, you just have to change the config file and not update paths in the annotation sheet. -This makes the annotation sheet universal across environments, users, publication, etc. The whole project is now portable. - -You can specify as many derived columns as you want. An expression including any sample attributes (using `{attribute}`) will be populated for each of those columns. - -Think of each sample as belonging to a certain type (for simple experiments, the type will be the same). -Then define the location of these samples in the project configuration file. -As a side bonus, you can easily include samples from different locations, and you can also use the same sample annotation sheet on different environments -(i.e. servers or users) by having multiple project config files (or, better yet, by defining a `subproject` for each environment). -The only thing you have to change is the project-level expression describing the location, not any sample attributes. -Plus, you get to eliminate those annoying `long/path/arguments/in/your/sample/annotation/sheet`. - -Check out the complete working example in the [`microtest` repository](https://github.com/databio/microtest/tree/master/config). diff --git a/docs/divvy/README.md b/docs/divvy/README.md deleted file mode 100644 index a691fda91..000000000 --- a/docs/divvy/README.md +++ /dev/null @@ -1,66 +0,0 @@ -![Logo](../img/divvy_logo.svg) - -## What is `divvy`? - -The submission configuration tool embedded in `looper` is called `divvy`. Divvy is useful independently from looper, but it ships with looper. Divvy allows you to populate job submission scripts by integrating job-specific settings with separately configured computing environment settings. 
Divvy *makes software portable*, so users may easily toggle among any computing resource (laptop, cluster, cloud). - -![Merge](../img/divvy-merge.svg) -## What makes `divvy` better? - -![NoDivvy](../img/nodivvy.svg) - -Tools require a particular compute resource setup. For example, one pipeline requires SLURM, another requires AWS, and yet another just runs directly on your laptop. This makes it difficult to transfer to different environments. For tools that can run in multiple environments, each one must be configured separately. - -
- - -Instead, `divvy`-compatible tools can run on any computing resource. **Users configure their computing environment once, and all divvy-compatible tools will use this same configuration.** - -![Connect](../img/divvy-connect.svg) - -Divvy reads a standard configuration file describing available compute resources and then uses a simple template system to write custom job submission scripts. Computing resources are organized as *compute packages*, which users select, populate with values, and build scripts for compute jobs. - -
- -Use the default compute packages or [configure your own](configuration.md). See what's available: - -```{console} -divvy list -``` - -```{console} -Divvy config: divvy_config.yaml - -docker -default -singularity_slurm -singularity -local -slurm -``` - - -Divvy will take variables from a file or the command line, merge these with environment settings to create a specific job script. Write a submission script from the command line: - -```{console} -divvy write --package slurm \ - --settings myjob.yaml \ - --compute sample=sample1 \ - --outfile submit_script.txt -``` - -### Python interface - -You can also use `divvy` via python interface, or you can use it to make your own python tools divvy-compatible: - -```{python} -import divvy -dcc = divvy.ComputingConfiguration() -dcc.activate_package("slurm") - -# write out a submission script -dcc.write_script("test_script.sub", - {"code": "bowtie2 input.bam output.bam"}) -``` - -For more details, check out the [tutorial](tutorial). diff --git a/docs/divvy/adapters.md b/docs/divvy/adapters.md deleted file mode 100644 index 161fd51e6..000000000 --- a/docs/divvy/adapters.md +++ /dev/null @@ -1,18 +0,0 @@ -# Adapters make template variables flexible - -Starting with `divvy v0.5.0` the configuration file can include an `adapters` section, which is used to provide a set of variable mappings that `divvy` uses to populate the submission templates. - -This makes the connection with `divvy` and client software more flexible and more elegant, since the source of the data does not need to follow any particular naming scheme, any mapping can be used and adapted to work with any `divvy` templates. - -## Example - -```yaml -adapters: - CODE: namespace.command - LOGFILE: namespace1.log_file - JOBNAME: user_settings.program.job_name - CORES: processors_number -... -``` - -As you can see in the example `adapters` section above, each adapter is a key-value pair that maps a `divvy` template variable to a target value. 
The target values can use namespaces (nested mapping). diff --git a/docs/divvy/configuration.md b/docs/divvy/configuration.md deleted file mode 100644 index ad5943e01..000000000 --- a/docs/divvy/configuration.md +++ /dev/null @@ -1,97 +0,0 @@ -# Installing divvy - -Divvy is automatically installed when you install looper. See if your install worked by calling `divvy -h` on the command line. If the `divvy` executable is not in your `$PATH`, append this to your `.bashrc` or `.profile` (or `.bash_profile` on macOS): - -```{console} -export PATH=~/.local/bin:$PATH -``` - -# Initial configuration - -On a fresh install, `divvy` comes pre-loaded with some built-in compute packages, which you can explore by typing `divvy list`. If you need to tweak these or create your own packages, you will need to configure divvy manually. Start by initializing an empty `divvy` config file: - -```{console} -export DIVCFG="divvy_config.yaml" -divvy init $DIVCFG -``` - -This `init` command will create a default config file, along with a folder of templates. - -The `divvy write` and `list` commands require knowing where this config file is. You can pass it on the command line all the time (using the -c parameter), but this gets old. An alternative is to set up the $DIVCFG environment variable. Divvy will automatically use the config file in this environmental variable if it exists. Add this line to your `.bashrc` or `.profile` if you want it to persist for future command-line sessions. You can always specify -c if you want to override the value in the $DIVCFG variable on an ad-hoc basis: - -```{console} -export DIVCFG=/path/to/divvy_config.yaml -``` - -# The divvy configuration file - -At the heart of `divvy` is the *divvy configuration file*, or `DIVCFG` for short. This is a `yaml` file that specifies a user's available *compute packages*.
Each compute package represents a computing resource; for example, by default we have a package called `local` that populates templates to simply run jobs in the local console, and another package called `slurm` with a generic template to submit jobs to a SLURM cluster resource manager. Users can customize compute packages as much as needed. - -## Configuration file priority lookup - -When `divvy` starts, it checks a few places for the `DIVCFG` file. First, the user may specify a `DIVCFG` file when invoking `divvy` either from the command line or from within python. If the file is not provided, `divvy` will next look for a file in the `$DIVCFG` environment variable. If it cannot find one there, then it will load a default configuration file with a few basic compute packages. We recommend setting the `DIVCFG` environment variable as the most convenient use case. - -## Customizing your configuration file - -The easiest way to customize your computing configuration is to edit the default configuration file. To get a fresh copy of the default configuration, use `divvy init custom_divvy_config.yaml`. This will create for you a config file along with a folder containing all the default templates. - -Here is an example `divvy` configuration file: - -```{console} -compute_packages: - default: - submission_template: templates/local_template.sub - submission_command: sh - local: - submission_template: templates/local_template.sub - submission_command: sh - develop_package: - submission_template: templates/slurm_template.sub - submission_command: sbatch - partition: develop - big: - submission_template: templates/slurm_template.sub - submission_command: sbatch - partition: bigmem -``` - -The sub-sections below `compute_packages` each define a *compute package* that can be activated. `Divvy` uses these compute packages to determine how to submit your jobs. If you don't specify a package to activate, `divvy` uses the package named `default`.
You can make your default whatever you like. You can activate any other compute package __on the fly__ by calling the `activate_package` function from python, or using the `--package` command-line option. - -You can make as many compute packages as you wish, and name them whatever you wish. You can also add whatever attributes you like to the compute package. There are only two required attributes: each compute package must specify the `submission_command` and `submission_template` attributes. - -### The `submission_command` attribute - -The `submission_command` attribute is the string your cluster resource manager uses to submit a job. For example, in our compute package named `develop_package`, we've set `submission_command` to `sbatch`. We are telling divvy that submitting this job should be done with: `sbatch submission_script.txt`. - -### The `submission_template` attribute - -Each compute package specifies a path to a template file (`submission_template`). The template file provides a skeleton that `divvy` will populate with job-specific attributes. These paths can be relative or absolute; relative paths are considered *relative to the DIVCFG file*. Let's explore what template files look like next. - -## Template files - -Each compute package must point to a template file with the `submission_template` attribute. These template files are typically stored relative to the `divvy` configuration file. Template files are taken by `divvy`, populated with job-specific information, and then run as scripts. 
Here's an example of a generic SLURM template file: - -```{bash} -#!/bin/bash -#SBATCH --job-name='{JOBNAME}' -#SBATCH --output='{LOGFILE}' -#SBATCH --mem='{MEM}' -#SBATCH --cpus-per-task='{CORES}' -#SBATCH --time='{TIME}' -#SBATCH --partition='{PARTITION}' -#SBATCH -m block -#SBATCH --ntasks=1 - -echo 'Compute node:' `hostname` -echo 'Start time:' `date +'%Y-%m-%d %T'` - -srun {CODE} -``` - -Template files use variables (*e.g.* `{VARIABLE}`), which will be populated independently for each job. If you want to make your own templates, you should check out the default templates (in the [submit_templates](https://github.com/pepkit/divcfg/tree/master/templates) folder). Many users will not need to tweak the template files, but if you need to, you can also create your own templates, giving `divvy` ultimate flexibility to work with any compute infrastructure in any environment. To create a custom template, just follow the examples. Then, point to your custom template in the `submission_template` attribute of a compute package in your `DIVCFG` config file. - - - -## Resources - -You may notice that the compute config file does not specify resources to request (like memory, CPUs, or time). Yet, these are required in order to submit a job to a cluster. **Resources are not handled by the divcfg file** because they not relative to a particular computing environment; instead they vary by pipeline and sample. As such, these items should be provided elsewhere. diff --git a/docs/divvy/containers.md b/docs/divvy/containers.md deleted file mode 100644 index a90d801c3..000000000 --- a/docs/divvy/containers.md +++ /dev/null @@ -1,76 +0,0 @@ - -# Configuring containers with divvy - -The divvy template framework is a natural way to run commands in a container, for example, using `docker` or `singularity`. All we need to do is 1) design a template that will run the job in the container, instead of natively; and 2) create a new compute package that will use that template. 
- -## A template for container runs - -If you start up divvy without giving it a DIVCFG file, it will come with a few default compute packages that include templates for containers. You can also find these in [the divcfg repository](http://github.com/pepkit/divcfg), which includes these scenarios: - -- singularity on SLURM -- singularity on localhost -- docker on localhost -- others - -If you need a different system, looking at those examples should get you started toward making your own. To take a quick example, using singularity on SLURM combines the basic SLURM script template with these lines to execute the run in container: - -``` -singularity instance.start {SINGULARITY_ARGS} {SINGULARITY_IMAGE} {JOBNAME}_image -srun singularity exec instance://{JOBNAME}_image {CODE} -singularity instance.stop {JOBNAME}_image -``` - -This particular template uses some variables provided by different sources: `{JOBNAME}`, `{CODE}`, `{SINGULARITY_ARGS}` and `{SINGULARITY_IMAGE}`. These arguments could be defined at different places. For example, the `{SINGULARITY_IMAGE}` variable should point to a singularity image that could vary by pipeline, so it makes most sense to define this variable individually for each pipeline. So, any pipeline that provides a container should probably include a `singularity_image` attribute providing a place to point to the appropriate container image. - -Of course, you will also need to make sure that you have access to `singularity` command from the compute nodes; on some clusters, you may need to add a `module load singularity` (or some variation) to enable it. - -The `{SINGULARITY_ARGS}` variable comes just right after the `instance.start` command, and can be used to pass any command-line arguments to singularity. We use these, for example, to bind host disk paths into the container. **It is critical that you explicitly bind any file systems with data necessary for the pipeline so the running container can see those files**. 
The [singularity documentation](https://singularity.lbl.gov/docs-mount#specifying-bind-paths) explains this, and you can find other arguments detailed there. Because this setting describes something about the computing environment (rather than an individual pipeline or sample), it makes most sense to put it in the `DIVCFG` file for a particular compute package. The next section includes examples of how to use `singularity_args`. - -If you're using [looper](http://looper.databio.org), the `{JOBNAME}` and `{CODE}` variables will be provided automatically by looper. - -## Adding compute packages for container templates - -To add a package for these templates to a `DIVCFG` file, we just add a new section. There are a few examples in this repository. A singularity example we use at UVA looks like this: - -``` -singularity_slurm: - submission_template: templates/slurm_singularity_template.sub - submission_command: sbatch - singularity_args: --bind /sfs/lustre:/sfs/lustre,/nm/t1:/nm/t1 -singularity_local: - submission_template: templates/localhost_singularity_template.sub - submission_command: sh - singularity_args: --bind /ext:/ext -``` - -These singularity compute packages look just like the typical ones, but just change the `submission_template` to point to the new containerized templates described in the previous section, and then they add the `singularity_args` variable, which is what will populate the `{SINGULARITY_ARGS}` variable in the template. Here we've used these to bind (mount) particular file systems the container will need. You can use these to pass along any environment-specific settings to your singularity container. - -With this setup, if you want to run a singularity container, just specify `--compute singularity_slurm` or `--compute singularity_local` and it will use the appropriate template. 
- -For another example, take a look at the basic `localhost_container.yaml` DIVCFG file, which describes a possible setup for running docker on a local computer: - -``` -compute: - default: - submission_template: templates/localhost_template.sub - submission_command: sh - singularity: - submission_template: templates/localhost_singularity_template.sub - submission_command: sh - singularity_args: --bind /ext:/ext - docker: - submission_template: templates/localhost_docker_template.sub - submission_command: sh - docker_args: | - --user=$(id -u) \ - --env="DISPLAY" \ - --volume ${HOME}:${HOME} \ - --volume="/etc/group:/etc/group:ro" \ - --volume="/etc/passwd:/etc/passwd:ro" \ - --volume="/etc/shadow:/etc/shadow:ro" \ - --volume="/etc/sudoers.d:/etc/sudoers.d:ro" \ - --volume="/tmp/.X11-unix:/tmp/.X11-unix:rw" \ - --workdir="`pwd`" \ -``` - -Notice the `--volume` arguments, which mount disk volumes from the host into the container. This should work out of the box for most docker users. diff --git a/docs/divvy/default-packages.md b/docs/divvy/default-packages.md deleted file mode 100644 index eed0fa14b..000000000 --- a/docs/divvy/default-packages.md +++ /dev/null @@ -1,6 +0,0 @@ -# Default divvy compute packages - -Divvy comes with a built-in default configuration that provides a few packages and templates. You can configure your own with `divvy init` and then adding whatever you like. The defaults provided can be found at these links: - -- [list of available default packages](https://github.com/pepkit/divvy/blob/master/divvy/submit_templates/default_compute_settings.yaml) -- [default templates](https://github.com/pepkit/divvy/tree/master/divvy/submit_templates) \ No newline at end of file diff --git a/docs/faq.md b/docs/faq.md deleted file mode 100644 index 3ae9b30c1..000000000 --- a/docs/faq.md +++ /dev/null @@ -1,43 +0,0 @@ -# FAQ - - -## What kind of pipelines can `looper` run? - -`Looper` can run samples through *any pipeline that runs on the command line*. 
The flexible [pipeline interface](../pipeline-interface) file allows `looper` to execute arbitrary shell commands. A pipeline may consist of scripts in languages like Perl, Python, or bash, or it may be built with a particular framework. Typically, we use Python pipelines built using the [`pypiper` package](http://pypiper.readthedocs.io), which provides some additional power to `looper`, but that's optional. - - -## Why isn't the `looper` executable available on `PATH`? - -By default, Python packages are installed to `~/.local/bin`. -You can add that location to your path by appending it (`export PATH=$PATH:~/.local/bin`). - -## How can I run my jobs on a cluster? - -Looper uses the external package [divvy](http://code.databio.org/divvy) for cluster computing, making it flexible enough to use with any cluster resource environment. Please see the [tutorial on cluster computing with looper and divvy](running-on-a-cluster.md). - - -## What's the difference between `looper` and `pypiper`? - -[`pypiper`](http://pypiper.readthedocs.io) is a more traditional workflow-building framework; it helps you build pipelines to process individual samples. [`looper`](http://looper.readthedocs.io) is completely pipeline-agnostic, and has nothing to do with individual processing steps; it operates on groups of samples (as in a project), submitting the appropriate pipeline(s) to a cluster or server (or running them locally). The two projects are independent and can be used separately, but they are most powerful when combined. They complement one another, together constituting a comprehensive pipeline management system. - -## Why isn't a sample being processed by a pipeline (`Not submitting, flag found: ['*_.flag']`)? - -When using the `run` subcommand, for each sample being processed `looper` first checks for *"flag" files* in the sample's designated output folder (which can be `_completed.flag`, or `_running.flag`, or `_failed.flag`).
Typically, we don't want to resubmit a job that's already running or already finished, so by default, `looper` **will *not* submit a job when it finds a flag file**. This is what the message above is indicating. - -If you do in fact want to re-run a sample (maybe you've updated the pipeline, or you want to restart a failed attempt), you can do so by just passing to `looper` at startup the `--ignore-flags` option; this will skip the flag check **for *all* samples**. If you only want to re-run or restart a few samples, it's best to just delete the flag files for the samples you want to restart, then use `looper run` as normal. - -You may be interested in the [usage docs](../usage) for the `looper rerun` command, which runs any failed samples. - -## How can I resubmit a subset of jobs that failed? - -As of version `0.11`, you can use `looper rerun` to submit only jobs with a `failed` flag. By default, `looper` will *not* submit a job that has already run. If you want to restart a sample (maybe you've updated the pipeline, or you want to restart a failed attempt), you can either use `looper rerun` to restart only failed jobs, or you pass `--ignore-flags`, which will **resubmit *all* samples**. If you want more specificity, you can just manually delete the "flag" files for the samples you want to restart, then use `looper run` as normal. - -## Why are computing resources defined in the pipeline interface file instead of in the `divvy` computing configuration file? - -You may notice that the compute config file does not specify resources to request (like memory, CPUs, or time). Yet, these are required in order to submit a job to a cluster. **Resources are not handled by the divcfg file** because they are not relative to a particular computing environment; instead they vary by pipeline and sample. As such, these items should be defined at other stages. - -Resources are defined in the `pipeline_interface.yaml` file that connects looper to a pipeline.
The reason for this is that pipeline developers are the most likely to know what sort of resources their pipeline requires, so they are in the best position to define the resources requested. For more information on how to adjust resources, see the `compute` section of the [pipeline interface page](pipeline-interface-specification.md). If all the different configuration files seem confusing, now is a good time to review [who's who in configuration files](config-files.md). - -## Which configuration file has which settings? - -There's a list on the [config files page](config-files.md). diff --git a/docs/features.md b/docs/features.md deleted file mode 100644 index c45ff71f9..000000000 --- a/docs/features.md +++ /dev/null @@ -1,49 +0,0 @@ -# Features and benefits - -[cli]: img/cli.svg -[computing]: img/computing.svg -[flexible_pipelines]: img/flexible_pipelines.svg -[job_monitoring]: img/job_monitoring.svg -[resources]: img/resources.svg -[subprojects]: img/subprojects.svg -[collate]: img/collate.svg -[file_yaml]: img/file_yaml.svg -[html]: img/HTML.svg -[modular]: img/modular.svg - - -![modular][modular] **Modular approach to job handling** - -Looper **completely divides job handling from pipeline processing**. This modular approach simplifies the pipeline-building process because pipelines no longer need to worry about sample metadata parsing. - -![file_yaml][file_yaml] **The power of standard PEP format** - -`Looper` inherits a bunch of advantages from [standard PEP format](http://pepkit.github.io): For example, **you only need to learn 1 way to format your project metadata, and it will work with any pipeline**. PEP format allows **subprojects**, which make it easy to define two very similar projects without duplicating project metadata. 
It also makes your project immediately compatible with other tools in pepkit; for example, you can import all your sample metadata (and pipeline results) in an R or python analysis environment with the [pepr](https://github.com/pepkit/pepr) R package or the [peppy](https://github.com/pepkit/peppy) python package. Using PEP's *derived attributes* feature makes projects portable, and can also be used to collate input files across file systems and naming conventions, making it easy to share projects across compute environments or individuals. - - -![computing][computing] **Universal parallelization implementation** - -Looper's sample-level parallelization applies to all pipelines, so individual pipelines do not need to reinvent the wheel. By default `looper` will simply run your jobs serially, but `looper` employs [divvy](http://code.databio.org/divvy) to let you process your pipelines on any cluster resource manager (SLURM, SGE, etc.). Looper also allows you to specify compute queue/partition on-the-fly, by passing the ``--compute`` parameter to your call to ``looper run``, making it flexible if you have complex resource needs. This provides a convenient interface for submitting pipelines either to local compute or to any cluster resource manager, so individual pipeline authors do not need to worry about cluster job submission. - -![flexible_pipelines][flexible_pipelines] **Flexible pipelines** - -Use looper with any pipeline, any library, in any domain. We designed it to work with [pypiper](http://code.databio.org/pypiper), but **looper has an infinitely flexible command-line argument system that will let you configure it to work with any script (pipeline) that accepts command-line arguments**. You can also configure looper to submit multiple pipelines per sample.
- - -![job_monitoring][job_monitoring] **Job completion monitoring** - -Looper is job-aware and will not submit new jobs for samples that are already running or finished, making it easy to add new samples to existing projects, or re-run failed samples. - - -![resources][resources] **Flexible resources** - -Looper has an easy-to-use resource requesting scheme. With a few lines to define CPU, memory, clock time, or anything else, pipeline authors can specify different computational resources depending on the size of the input sample and pipeline to run. Or, just use a default if you don't want to mess with setup. - -![cli][cli] **Command line interface** - -Looper uses a command-line interface so you have total power at your fingertips. - -![html][html] **Beautiful linked result reports** - -Looper automatically creates an internally linked, portable HTML report highlighting all results for your pipeline, for every pipeline. -For an html report example see: [PEPATAC Gold Summary](https://pepatac.databio.org/en/latest/files/examples/gold/gold_summary.html) \ No newline at end of file diff --git a/docs/grouping-jobs.md b/docs/grouping-jobs.md deleted file mode 100644 index 9c247b4de..000000000 --- a/docs/grouping-jobs.md +++ /dev/null @@ -1,11 +0,0 @@ -# Grouping many jobs into one - -By default, `looper` will translate each row in your `sample_table` into a single job. But perhaps you are running a project with tens of thousands of rows, and each job only takes mere minutes to run; in this case, you'd rather just submit a single job to process many samples. `Looper` makes this easy with the `--lump` and `--lumpn` command line arguments. - -## Lumping jobs by job count: `--lumpn` - -It's quite simple: if you want to run 100 samples in a single job submission script, just tell looper `--lumpn 100`. - -## Lumping jobs by input file size: `--lump` - -But what if your samples are quite different in terms of input file size? 
For example, your project may include many small samples, which you'd like to lump together with 10 jobs to 1, but you also have a few control samples that are very large and should have their own dedicated job. If you just use `--lumpn` with 10 samples per job, you could end up lumping your control samples together, which would be terrible. To alleviate this problem, `looper` provides the `--lump` argument, which uses input file size to group samples together. By default, you specify an argument in number of gigabytes. Looper will go through your samples and accumulate them until the total input file size reaches your limit, at which point it finalizes and submits the job. This will keep larger files in independent runs and smaller files grouped together. diff --git a/docs/how-to-merge-inputs.md b/docs/how-to-merge-inputs.md deleted file mode 100644 index a1d983f9c..000000000 --- a/docs/how-to-merge-inputs.md +++ /dev/null @@ -1,60 +0,0 @@ -# How to handle multiple input files - -*Dealing with multiple input files is described in detail in the [PEP documentation](http://pep.databio.org/en/latest/specification/#project-attribute-subsample_table).* - -Briefly: - -Sometimes you have multiple input files that you want to merge for one sample. For example, a common use case is a single library that was spread across multiple sequencing lanes, yielding multiple input files that need to be merged, and then run through the pipeline as one. Rather than putting multiple lines in your sample annotation sheet, which causes conceptual and analytical challenges, PEP has two ways to merge these: - -1. Use shell expansion characters (like `*` or `[]`) in your file path definitions (good for simple merges) -2. Specify a *sample subannotation tables* which maps input files to samples for samples with more than one input file (infinitely customizable for more complicated merges). 
- - -## Multi-value sample attributes behavior in the pipeline interface command templates - -Both sample subannotation tables and shell expansion characters lead to sample attributes with multiple values, stored in a list of strings (`multi_attr1` and `multi_attr2`), as opposed to a standard scenario, where a single value is stored as a string (`single_attr`): - -``` -Sample -sample_name: sample1 -subsample_name: ['0', '1', '2'] -multi_attr1: ['one', 'two', 'three'] -multi_attr2: ['four', 'five', 'six'] -single_attr: test_val -``` - -### Access individual elements in lists - -A pipeline interface author can leverage that fact and access the individual elements, e.g. iterate over them and append to a string using the Jinja2 syntax: - -```bash -pipeline_name: test_iter -pipeline_type: sample -command_template: > - --input-iter {%- for x in sample.multi_attr1 -%} --test-individual {x} {% endfor %} # iterate over multiple values - --input-single {sample.single_attr} # use the single value as is - -``` - -This results in a submission script that includes the following command: -```bash ---input-iter --test-individual one --test-individual two --test-individual three ---input-single test_val -``` - -### Concatenate elements in lists - -The most common use case is just concatenating the multiple values and separating them with a space -- **providing multiple input values to a single argument on the command line**. Therefore, all the multi-value sample attributes that have not been processed with Jinja2 logic are automatically concatenated.
For instance, the following command template in a pipeline interface will result in the submission script presented below: - -Pipeline interface: -```bash -pipeline_name: test_concat -pipeline_type: sample -command_template: > - --input-concat {sample.multi_attr1} # concatenate all the values -``` - -Command in the submission script: -```bash ---input-concat one two three -``` diff --git a/docs/img/HTML.svg b/docs/img/HTML.svg deleted file mode 100644 index 3282c9982..000000000 --- a/docs/img/HTML.svg +++ /dev/null @@ -1,526 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - Openclipart - - - - - - - - - - - - diff --git a/docs/img/cli.svg b/docs/img/cli.svg deleted file mode 100644 index 803ad3b99..000000000 --- a/docs/img/cli.svg +++ /dev/null @@ -1,379 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/collate.svg b/docs/img/collate.svg deleted file mode 100644 index c536fff2e..000000000 --- a/docs/img/collate.svg +++ /dev/null @@ -1,133 +0,0 @@ - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - diff --git a/docs/img/computing.svg b/docs/img/computing.svg deleted file mode 100644 index eb3fb2f8d..000000000 --- a/docs/img/computing.svg +++ /dev/null @@ -1,756 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/divvy-connect.svg b/docs/img/divvy-connect.svg deleted file mode 
100644 index 9bf7c637a..000000000 --- a/docs/img/divvy-connect.svg +++ /dev/null @@ -1,648 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - Tool 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - SLURMcluster - - Tool 2 - - Tool 3 - Cloud - - - - - - - - Laptop - - - - - Tool 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - SLURMcluster - - Tool 2 - - Tool 3 - Cloud - - - - - - - - Laptop - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/divvy-merge.svg b/docs/img/divvy-merge.svg deleted file mode 100644 index fefe9cd7d..000000000 --- a/docs/img/divvy-merge.svg +++ /dev/null @@ -1,1066 +0,0 @@ - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Jobsettings - - - - - - Environmentsettings - - - - - - - - Submissionscript - - - - - - - - - - - - - - - - SUB - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - template - command - - - - - - - - - - Command-lineadjustments - - - - diff --git a/docs/img/divvy_bug.svg b/docs/img/divvy_bug.svg deleted file mode 100644 index c9f1472c8..000000000 --- a/docs/img/divvy_bug.svg +++ /dev/null @@ -1,103 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/divvy_logo.svg b/docs/img/divvy_logo.svg deleted file mode 100644 index 0ca13923e..000000000 --- a/docs/img/divvy_logo.svg +++ /dev/null @@ -1,153 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - divvy - - diff --git a/docs/img/divvy_logo_dark.svg b/docs/img/divvy_logo_dark.svg deleted file mode 100644 index b7b6dfc66..000000000 --- a/docs/img/divvy_logo_dark.svg +++ /dev/null @@ -1,153 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - - - - divvy - - diff --git a/docs/img/favicon.ico b/docs/img/favicon.ico deleted file mode 100644 index d118e4754..000000000 Binary files 
a/docs/img/favicon.ico and /dev/null differ diff --git a/docs/img/favicon_looper.ico b/docs/img/favicon_looper.ico deleted file mode 100644 index d118e4754..000000000 Binary files a/docs/img/favicon_looper.ico and /dev/null differ diff --git a/docs/img/favicon_looper.svg b/docs/img/favicon_looper.svg deleted file mode 100644 index 8b16d8fee..000000000 --- a/docs/img/favicon_looper.svg +++ /dev/null @@ -1,72 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - diff --git a/docs/img/file_yaml.svg b/docs/img/file_yaml.svg deleted file mode 100644 index 2aaa54142..000000000 --- a/docs/img/file_yaml.svg +++ /dev/null @@ -1,394 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - .SH - - - - - - - - - - - .yaml - - - - - - - - diff --git a/docs/img/flexible_pipelines.svg b/docs/img/flexible_pipelines.svg deleted file mode 100644 index 5a331625c..000000000 --- a/docs/img/flexible_pipelines.svg +++ /dev/null @@ -1,270 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - diff --git a/docs/img/job_monitoring.svg b/docs/img/job_monitoring.svg deleted file mode 100644 index 3f09da534..000000000 --- a/docs/img/job_monitoring.svg +++ /dev/null @@ -1,286 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - diff --git a/docs/img/looper_bug.svg b/docs/img/looper_bug.svg deleted file mode 100644 index 27e97ac2d..000000000 --- a/docs/img/looper_bug.svg +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - diff --git a/docs/img/looper_bug_dark.svg b/docs/img/looper_bug_dark.svg deleted file mode 100644 index eb0129501..000000000 --- a/docs/img/looper_bug_dark.svg +++ /dev/null @@ -1,94 +0,0 @@ - - - - - - - - - - 
image/svg+xml - - - - - - - - - - - - - - - diff --git a/docs/img/looper_logo_dark.svg b/docs/img/looper_logo_dark.svg deleted file mode 100644 index 6b7d25ab5..000000000 --- a/docs/img/looper_logo_dark.svg +++ /dev/null @@ -1,122 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/looper_logo_text.svg b/docs/img/looper_logo_text.svg deleted file mode 100644 index 86ce44636..000000000 --- a/docs/img/looper_logo_text.svg +++ /dev/null @@ -1,110 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - - - - - - - - looper - - - diff --git a/docs/img/modular.svg b/docs/img/modular.svg deleted file mode 100644 index 10e1edf81..000000000 --- a/docs/img/modular.svg +++ /dev/null @@ -1,118 +0,0 @@ - - - - - - - - - - - - - - - - image/svg+xml - - - - - Openclipart - - - ftnetwork connected - 2011-01-31T02:06:32 - Originally uploaded by Danny Allen for OCAL 0.18 this icon is part of the flat theme - https://openclipart.org/detail/113647/ftnetwork-connected-by-anonymous - - - Anonymous - - - - - flat - icon - theme - - - - - - - - - - - diff --git a/docs/img/nodivvy.svg b/docs/img/nodivvy.svg deleted file mode 100644 index 50316a87f..000000000 --- a/docs/img/nodivvy.svg +++ /dev/null @@ -1,646 +0,0 @@ - - - - - - - - - - image/svg+xml - - - - - - - - Tool 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - SLURMcluster - - Tool 2 - - Tool 3 - Cloud - - - - - - - - Laptop - - - - - Tool 1 - - - - - - - - - - - - - - - - - - - - - - - - - - - - SLURMcluster - - Tool 2 - - Tool 3 - Cloud - - - - - - - - Laptop - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - diff --git a/docs/img/resources.svg b/docs/img/resources.svg deleted file mode 100644 index 944f83f2e..000000000 --- a/docs/img/resources.svg +++ /dev/null @@ -1,635 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - - diff --git a/docs/img/subprojects.svg b/docs/img/subprojects.svg deleted file mode 100644 index e35e1db46..000000000 --- a/docs/img/subprojects.svg +++ /dev/null @@ -1,293 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - image/svg+xml - - - - - - - - - - - diff --git a/docs/implied-columns.md b/docs/implied-columns.md deleted file mode 100644 index 7879956ca..000000000 --- a/docs/implied-columns.md +++ /dev/null @@ -1,49 +0,0 @@ -# Implied columns - -At some point, you may have a situation where you need a single sample attribute (or column) -to populate several different pipeline arguments with different values. -In other words, the value of a given attribute may *imply* values for other attributes. -It would be nice if you didn't have to enumerate all of these secondary, implied attributes, -and could instead just infer them from the value of the original attribute. - -For example, if my `organism` attribute is `human`, this implies a few other secondary attributes -(which may be project-specific): For one project, I want to set `genome` to `hg38` and `macs_genome_size` to `hs`. -Of course, I could just define columns called `genome` and `macs_genome_size`, but these would be constant across samples, so it feels inefficient and unwieldy. -Plus, changing the aligned genome would require changing the sample annotation sheet (every sample, in fact). -You can certainly do this with `looper`, but a better way is to handle these things at the project level. - -As a more elegant alternative, in a project config file `looper` will recognize a section called `implied_columns`. 
-Instead of hard-coding `genome` and `macs_genome_size` in the sample annotation sheet, -you can simply specify that the attribute `organism` *implies* additional attribute-value pairs -(which may vary by sample based on the value of the `organism` attribute). -This lets you specify assemblies, genome size, and other similar variables all in your project config file. - -To do this, just add an `implied_columns` section to your project_config.yaml file. Example: - -```yaml -implied_columns: - organism: - human: - genome: "hg38" - macs_genome_size: "hs" - mouse: - genome: "mm10" - macs_genome_size: "mm" -``` - -There are 3 levels in the `implied_columns` hierarchy. -The first (directly under `implied_columns`; here, `organism`), are primary columns from which new attributes will be inferred. -The second layer (here, `human` or `mouse`) are possible values your samples may take in the primary column. -The third layer (`genome` and `macs_genome_size`) are the key-value pair of new, implied columns -for any samples with the required value for that primary column. - -In this example, any samples with organism set to `"human"` will automatically also have attributes for `genome` (`"hg38"`) and for `macs_genome_size` (`"hs"`). -Any samples with `organism` set to `"mouse"` will have the corresponding values. -A sample with `organism` set to `"frog"` would lack attributes for `genome` and `macs_genome_size`, since those columns are not implied by `"frog"`. - -This system essentially lets you set global, species-level attributes at the project level instead of duplicating -that information for every sample that belongs to a species. -Even better, it's generic, so you can do this for any partition of samples (just replace `organism` with whatever you like). - -This makes your project more portable and does a better job conceptually with separating sample attributes from project attributes. 
-After all, a reference assembly is not a property of a sample, but is part of the broader project context. diff --git a/docs/initialize.md b/docs/initialize.md deleted file mode 100644 index 0a2c71537..000000000 --- a/docs/initialize.md +++ /dev/null @@ -1,21 +0,0 @@ -# How to initialize a looper repository - -*This is considered a beta feature and may change in future releases*. - -Looper provides a command `looper init` that allows you to initialize folders as looper repositories. This enables you to use `looper` without passing your PEP every time. - -```bash -looper init pep.yaml -``` - -Now, as long as you are operating from within this directory or any of the subdirectories, you can run any looper command without passing `pep.yaml`: - -```bash -looper run -``` - -The `looper init` command creates a dotfile called `.looper.yaml` in the current directory. This file simply points looper to the config file passed as positional argument to `looper init`: - -```yaml -config_file_path: relative/path/to/pep.yaml -``` diff --git a/docs/looper-config.md b/docs/looper-config.md deleted file mode 100644 index 3c2d095ce..000000000 --- a/docs/looper-config.md +++ /dev/null @@ -1,36 +0,0 @@ -# How to use the looper config file - -Starting with `looper` version `>=1.5.0`, you should specify a pipeline interface in the looper config file, rather than in the PEP. - -Example looper config file using local PEP: - -```yaml -pep_config: $HOME/hello_looper-master/project/project_config.yaml -output_dir: "$HOME/hello_looper-master/output" -pipeline_interfaces: - sample: ["$HOME/hello_looper-master/pipeline/pipeline_interface"] - project: "some/project/pipeline" -``` - -In addition, looper>=1.5.0 supports projects from [PEPhub](https://pephub.databio.org/). -Using a PEP from PEPhub allows a user to run a pipeline without downloading the PEP. This allows you to keep the sample table in a centralized, shared location. 
You need only specify all necessary -environment variables used by the PEP. - -Example looper config file using PEPhub project: - -```yaml -pep_config: pephub::databio/looper:default -output_dir: "$HOME/hello_looper-master/output" -pipeline_interfaces: - sample: ["$HOME/hello_looper-master/pipeline/pipeline_interface"] - project: "$HOME/hello_looper-master/project/pipeline" -``` - -Where: -- `output_dir` is pipeline output directory, where results will be saved. -- `pep_config` is a local config file or PEPhub registry path. (registry path should be specified in one -one of supported ways: `namespace/name`, `pephub::namespace/name`, `namespace/name:tag`, or `pephub::namespace/name:tag`) -- `pipeline interfaces` is a local path to project or sample pipelines. - -To run pipeline, go to the directory of .looper.config and execute command in your terminal: -`looper run --looper-config {looper_config_path}` or `looper runp --looper-config {looper_config_path}`. diff --git a/docs/looper-report.md b/docs/looper-report.md deleted file mode 100644 index 6cd4a79ea..000000000 --- a/docs/looper-report.md +++ /dev/null @@ -1,13 +0,0 @@ -# Create a Browsable HTML Report - -Looper can create a browsable html report of all project results using the command: - -```terminal -looper report --looper-config .your_looper_config.yaml -``` - -Beginning in Looper 1.7.0, the ``--portable`` flag can be used to create a shareable, zipped version of the html report. - -An example html report out put can be found here: [PEPATAC Gold Summary](https://pepatac.databio.org/en/latest/files/examples/gold/gold_summary.html) - -Note: pipestat must be configured by looper to perform this operation. 
Please see the pipestat section for more information: [Using pipestat](pipestat.md) \ No newline at end of file diff --git a/docs/multiple-pipelines.md b/docs/multiple-pipelines.md deleted file mode 100644 index adc296006..000000000 --- a/docs/multiple-pipelines.md +++ /dev/null @@ -1,22 +0,0 @@ -# A project with multiple pipelines - -In earlier versions of looper (v < 1.0), we used a `protocol_mappings` section to map samples with different `protocol` attributes to different pipelines. In the current pipeline interface (looper v > 1.0), we eliminated the `protocol_mappings`, because this can now be handled using sample modifiers, simplifying the pipeline interface. Now, each pipeline has exactly 1 pipeline interface. You link to the pipeline interface with a sample attribute. If you want the same pipeline to run on all samples, it's as easy as using an `append` modifier like this: - -``` -sample_modifiers: - append: - pipeline_interfaces: "test.yaml" -``` - -But if you want to submit different samples to different pipelines, depending on a sample attribute, like `protocol`, you can use an implied attribute: - -``` -sample_modifiers: - imply: - - if: - protocol: [PRO-seq, pro-seq, GRO-seq, gro-seq] # OR - then: - pipeline_interfaces: ["peppro.yaml"] -``` - -This approach uses only functionality of PEPs to handle the connection to pipelines as sample attributes, which provides full control and power using the familiar sample modifiers. It completely eliminates the need for re-inventing this complexity within looper, which eliminated the protocol mapping section to simplify the looper pipeline interface files. You can read more about the rationale of this change in [issue 244](https://github.com/pepkit/looper/issues/244#issuecomment-611154594). 
diff --git a/docs/parameterizing-pipelines.md b/docs/parameterizing-pipelines.md deleted file mode 100644 index e1c6f3a62..000000000 --- a/docs/parameterizing-pipelines.md +++ /dev/null @@ -1,69 +0,0 @@ -# How to pass extra command-line arguments - -Occasionally, a particular project needs to run a particular flavor of a pipeline. How can you adjust pipeline arguments for just this project? You can use looper *command extras* to solve this problem. Command extras let you pass any string on to the pipeline, which will be appended to the command. - -There are 2 ways to use command extras: for sample pipelines, or for project pipelines: - -## 1. Sample pipeline command extras - -### Adding sample command extras via sample attributes - -Looper uses a reserved sample attribute called `command_extras`, which you can set using general PEP sample modifiers however you wish. For example, if your extras are the same for all samples you could use an `append` modifier: - - -```yaml -sample_modifiers: - append: - command_extra: "--flavor-flag" -``` - -This will add `--flavor-flag` the end of the command looper constructs. If you need to modulate the extras depending on another attribute value, you could use an imply modifier: - -```yaml -sample_modifiers: - imply: - - if: - protocol: "rrbs" - then: - command_extra: "-C flavor.yaml --epilog" -``` - -### Adding sample command extras via the command line - -You can also pass extra arguments using `--command-extra` like this: - -```bash -looper run --looper-config .looper.yaml --command-extra="--flavor-flag" -``` - -## 2. 
Project pipeline command extras - -For *project pipelines*, you can specify command extras in the `looper` section of the PEP config: - -```yaml -looper: - output_dir: "/path/to/output_dir" - cli: - runp: - command-extra: "--flavor" -``` - -or as an argument to the `looper runp` command: - - -```bash -looper runp --looper-config .looper.yaml --command-extra="--flavor-flag" -``` - - -## Overriding PEP-based command extras - -By default, the CLI extras are *appended to the command_extra specified in your PEP*. If you instead want to *override* the command extras listed in the PEP, you can instead use `--command-extra-override`. - -So, for example, make your looper call like this: - -```bash -looper run --command-extra-override="-R" -``` - -That will remove any defined command extras and append `-R` to the end of any commands created by looper. diff --git a/docs/pipeline-interface-specification.md b/docs/pipeline-interface-specification.md deleted file mode 100644 index 8a0a01732..000000000 --- a/docs/pipeline-interface-specification.md +++ /dev/null @@ -1,222 +0,0 @@ ---- -title: Pipeline interface specification ---- - -

Pipeline interface specification

- -Table of contents: - -[TOC] - -## Introduction - -In order to run an arbitrary pipeline, we require a formal specification for how the pipeline is to be used. We define this using a *pipeline interface* file. It maps attributes of a PEP project or sample to the pipeline CLI arguments. Thus, it defines the interface between the project metadata (the PEP) and the pipeline itself. - -If you're using *existing* `looper`-compatible pipelines, you don't need to create a new interface; just point your project at the one that comes with the pipeline. When creating *new* `looper`-compatible pipelines, you'll need to create a new pipeline interface file. - -Pipeline interfaces are defined in the looper config file (e.g. `.looper.yaml`): - -```yaml -pep_config: ./project/project_config.yaml # pephub registry path or local path -output_dir: ./results -pipeline_interfaces: - sample: ./pipeline_pipestat/pipeline_interface.yaml -pipestat: - results_file_path: results.yaml - -``` - - -## Overview of pipeline interface components - -A pipeline interface may contain the following keys: - -- `pipeline_name` (REQUIRED) - A string identifying the pipeline, -- `pipeline_type` (REQUIRED) - A string indicating a pipeline type: "sample" (for `run`) or "project" (for `runp`), -- `command_template` (REQUIRED) - A [Jinja2](https://jinja.palletsprojects.com/en/2.11.x/) template used to construct a pipeline command to run. -- `linked_pipeline_interfaces` (OPTIONAL) - A collection of paths to sample pipeline interfaces related to this pipeline interface (used only in project pipeline interfaces for `looper report` purposes). -- `input_schema` (RECOMMENDED) - A [PEP Schema](http://eido.databio.org) formally defining *required inputs* for the pipeline -- `schema_path` (RECOMMENDED| REQUIRED FOR PIPESTAT) - A schema describing the *outputs* of the pipeline. 
-- `compute` (RECOMMENDED) - Settings for computing resources -- `var_templates` (RECOMMENDED) - A mapping of [Jinja2](https://jinja.palletsprojects.com/en/2.11.x/) templates and corresponding names, typically used to encode submission-specific paths that can be submission-specific -- `pre_submit` (OPTIONAL) - A mapping that defines the pre-submission tasks to be executed - -The pipeline interface should define either a sample pipeline or a project pipeline. Here's a simple example: - -```yaml -pipeline_name: RRBS -pipeline_type: sample -var_templates: - pipeline: "{looper.piface_dir}/pipeline1.py" - sample_info: "{looper.piface_dir}/{sample.name}/info.txt" -input_schema: path/to/rrbs_schema.yaml -command_template: {pipeline.var_templates.pipeline} --input {sample.data_path} --info {pipeline.sample_info.path} -``` - -Pretty simple. The `pipeline_name` is arbitrary. It's used for messaging and identification. Ideally, it's unique to each pipeline. In this example, we define a single sample-level pipeline. - -## Details of pipeline interface components - -### pipeline_name - -The pipeline name is arbitrary. It should be unique for each pipeline. Looper uses it for a few things: - -1. to construct the `job_name` variable (accessible via `{ looper.job_name }`). See [variable namespaces](variable-namespaces.md) for more details. - -2. to check for flags. For pipelines that produce flags, looper will be aware of them and not re-submit running jobs. - -### pipeline_type - -Looper can run 2 kinds of pipeline: *sample pipelines* run once per sample; *project pipelines* run once per project. The type of pipeline must be specified in the pipeline interface as `pipeline_type: sample` or `pipeline_type: project`. - -### command_template - -The command template is the most critical part of the pipeline interface. It is a [Jinja2](https://jinja.palletsprojects.com/) template for the command to run for each sample. 
Within the `command_template`, you have access to variables from several sources. These variables are divided into namespaces depending on the variable source. You can access the values of these variables in the command template using the single-brace jinja2 template language syntax: `{namespace.variable}`. For example, looper automatically creates a variable called `job_name`, which you may want to pass as an argument to your pipeline. You can access this variable with `{looper.job_name}`. The available namespaces are described in detail in [looper variable namespaces](variable-namespaces.md). - -Because it's based on Jinja2, command templates are extremely flexible. For example, optional arguments can be accommodated using Jinja2 syntax, like this: - -``` -command_template: > - {pipeline.path} - --sample-name {sample.sample_name} - --genome {sample.genome} - --input {sample.read1} - --single-or-paired {sample.read_type} - {% if sample.read2 is defined %} --input2 {sample.read2} {% endif %} - {% if sample.peak_caller is defined %} --peak-caller {sample.peak_caller} {% endif %} - {% if sample.FRIP_ref is defined %} --frip-ref-peaks {sample.FRIP_ref} {% endif %} -``` - -Arguments wrapped in Jinja2 conditionals will only be added *if the specified attribute exists for the sample*. - -### input_schema - -The input schema formally specifies the *input processed by this pipeline*. The input schema serves 2 related purposes: - -1. **Validation**. Looper uses the input schema to ensure that the project fulfills all pipeline requirements before submitting any jobs. Looper uses the PEP validation tool, [eido](http://eido.databio.org), to validate input data by ensuring that input samples have the attributes and input files required by the pipeline. Looper will only submit a sample pipeline if the sample validates against the pipeline's input schema. - -2. **Description**. 
The input schema is also useful to describe the inputs, including both required and optional inputs, thereby providing a standard way to describe a pipeline's inputs. In the schema, the pipeline author can describe exactly what the inputs mean, making it easier for users to learn how to structure a project for the pipeline. - -Details for how to write a schema in [writing a schema](http://eido.databio.org/en/latest/writing-a-schema/). The input schema format is an extended [PEP JSON-schema validation framework](http://pep.databio.org/en/latest/howto_validate/), which adds several capabilities, including - -- `required` (optional): A list of sample attributes (columns in the sample table) that **must be defined** -- `required_files` (optional): A list of sample attributes that point to **input files that must exist**. -- `files` (optional): A list of sample attributes that point to input files that are not necessarily required, but if they exist, should be counted in the total size calculation for requesting resources. - -If no `input_schema` is included in the pipeline interface, looper will not be able to validate the samples and will simply submit each job without validation. - -### output_schema - -The output schema formally specifies the *output produced by this pipeline*. It is used by downstream tools to that need to be aware of the products of the pipeline for further visualization or analysis. Beginning with Looper 1.6.0 and Pipestat 0.6.0, the output schema is a JSON-schema: [pipestat schema specification](http://pipestat.databio.org/en/latest/pipestat_specification/#pipestat-schema). 
- -Here is an example output schema: - -```yaml -title: An example output schema -description: An example description -type: object -properties: - pipeline_name: "default_pipeline_name" - samples: - type: object - properties: - number_of_things: - type: integer - description: "Number of things" - percentage_of_things: - type: number - description: "Percentage of things" - name_of_something: - type: string - description: "Name of something" - switch_value: - type: boolean - description: "Is the switch on or off" - output_file: - $ref: "#/$defs/file" - description: "This a path to the output file" - output_image: - $ref: "#/$defs/image" - description: "This a path to the output image" - md5sum: - type: string - description: "MD5SUM of an object" - highlight: true -$defs: - image: - type: object - object_type: image - properties: - path: - type: string - thumbnail_path: - type: string - title: - type: string - required: - - path - - thumbnail_path - - title - file: - type: object - object_type: file - properties: - path: - type: string - title: - type: string - required: - - path - - title -``` -Looper uses the output schema in its `report` function, which produces a browsable HTML report summarizing the pipeline results. The output schema provides the relative locations to sample-level and project-level outputs produced by the pipeline, which looper can then integrate into the output results. If the output schema is not included, the `looper report` will be unable to locate and integrate the files produced by the pipeline and will therefore be limited to simple statistics. - -### compute - -The compute section of the pipeline interface provides a way to set compute settings at the pipeline level. These variables can then be accessed in the command template. They can also be overridden by values in the PEP config, or on the command line. See the [looper variable namespaces](variable-namespaces.md) for details. 
- -There is one reserved attribute under `compute` with specialized behavior -- `size_dependent_variables` which we'll now describe in detail. - -#### size_dependent_variables - -The `size_dependent_variables` section lets you specify variables with values that are modulated based on the total input file size for the run. This is typically used to add variables for memory, CPU, and clock time to request, if they depend on the input file size. Specify variables by providing a relative path to a `.tsv` file that defines the variables as columns, with input sizes as rows. - -The pipeline interface simply points to a `tsv` file: - -```yaml -pipeline_type: sample -var_templates: - pipeline: {looper.piface_dir}/pepatac.py -command_template: > - {pipeline.var_templates.pipeline} ... -compute: - size_dependent_variables: resources-sample.tsv -``` - -The `resources-sample.tsv` file consists of a file with at least 1 column called `max_file_size`. Add any other columns you wish, each one will represent a new attribute added to the `compute` namespace and available for use in your command template. Here's an example: - -```tsv -max_file_size cores mem time -0.001 1 8000 00-04:00:00 -0.05 2 12000 00-08:00:00 -0.5 4 16000 00-12:00:00 -1 8 16000 00-24:00:00 -10 16 32000 02-00:00:00 -NaN 32 32000 04-00:00:00 -``` - -This example will add 3 variableS: `cores`, `mem`, and `time`, which can be accessed via `{compute.cores}`, `{compute.mem}`, and `{compute.time}`. Each row defines a "packages" of variable values. Think of it like a group of steps of increasing size. For a given job, looper calculates the total size of the input files (which are defined in the `input_schema`). Using this value, looper then selects the best-fit row by iterating over the rows until the calculated input file size does not exceed the `max_file_size` value in the row. This selects the largest resource package whose `max_file_size` attribute does not exceed the size of the input file. 
Max file sizes are specified in GB, so `5` means 5 GB. - -This final line in the resources `tsv` must include `NaN` in the `max_file_size` column, which serves as a catch-all for files larger than the largest specified file size. Add as many resource sets as you want. - -#### var_templates - -This section can consist of multiple variable templates that are rendered and can be reused. The namespaces available to the templates are listed in [variable namespaces](variable-namespaces.md) section. Please note that the variables defined here (even if they are paths) are arbitrary and are *not* subject to be made relative. Therefore, the pipeline interface author needs take care of making them portable (the `{looper.piface_dir}` value comes in handy!). - -#### pre_submit - -This section can consist of two subsections: `python_funcions` and/or `command_templates`, which specify the pre-submission tasks to be run before the main pipeline command is submitted. Please refer to the [pre-submission hooks system](pre-submission-hooks.md) section for a detailed explanation of this feature and syntax. - -## Validating a pipeline interface - -A pipeline interface can be validated using JSON Schema against [schema.databio.org/pipelines/pipeline_interface.yaml](http://schema.databio.org/pipelines/pipeline_interface.yaml). Looper automatically validates pipeline interfaces at submission initialization stage. diff --git a/docs/pipeline-tiers.md b/docs/pipeline-tiers.md deleted file mode 100644 index 13c2593b6..000000000 --- a/docs/pipeline-tiers.md +++ /dev/null @@ -1,19 +0,0 @@ -# The concept of two-tiered pipelines - -In our experience, we are typically interested in running two different types of commands: Those that operate on each sample independently, and those that operate on all samples simultaneously. Since sample-independent pipelines can be easily parallelized by sample, we distinguish these. 
- -Looper divides pipelines into two types: *sample* pipelines and *project* pipelines. - -This philosophy is conceptually similar to the [MapReduce](https://en.wikipedia.org/wiki/MapReduce) programming model, which applies a *split-apply-combine* strategy. In the case of running pipelines on sample-intensive research projects, we *split* the project into samples and *apply* the first tier of processing (the *sample* pipeline). We then *combine* the results in the second tier of processing (the *project* pipeline). - -Looper doesn't require you to use this two-stage system, but it simply makes it easy to do so. Many pipelines operate only at the sample level and leave the downstream cross-sample analysis to the user. - -## Sample pipelines - -The typical use case is sample-level pipelines. These are run with `looper run`. Pipeline interface defining a sample pipeline must to include `pipeline_type: "sample"` statement. - -## Project pipelines - -Project pipelines, identified by `pipeline_type: "project"` statement in the pipeline interface, will be run with `looper runp` (where the *p* stands for *project*). Running a project pipeline operates in almost exactly the same way as the sample pipeline, with 2 key exceptions: First, instead of creating a separate command for every sample, the `looper runp` will only create a single command per pipeline for the project. And second, the command template itself will not have access to a `sample` namespace representing a particular sample, since it's not running on a particular sample; instead, it will have access to a `samples` (plural) namespace, which contains all the attributes from all the samples. - -In a typical workflow, a user will first run the samples individually using `looper run`, and then, if the pipeline provides one, will run the project component using `looper runp` to summarize or aggregate the results into a project-level output. 
diff --git a/docs/pipestat.md b/docs/pipestat.md deleted file mode 100644 index d7ced7ef3..000000000 --- a/docs/pipestat.md +++ /dev/null @@ -1,175 +0,0 @@ -# Pipestat - -Starting with version 1.4.0, looper supports additional functionality for [pipestat](http://pipestat.databio.org/)-compatible pipelines. Pipestat-compatible pipelines will allow you to use looper to do 2 things: - -1. monitor the status of pipeline runs -2. summarize the results of pipelines - -For non-pipestat-compatible pipelines, you can still use looper to run pipelines, but you won't be able to use `looper report` or `looper check` to manage their output. - -## Pipestat configuration overview -Starting with version 1.6.0 configuring looper to work with pipestat has changed. - -Now, Looper will obtain pipestat configurations data from two sources: -1. pipeline interface -2. looper_config file - -Looper will combine the necessary configuration data and write a new pipestat configuration file named `looper_pipestat_config.yaml` which looper will place in its output directory. Pipestat then uses this configuration file to create the required PipestatManager objects. See [Hello_Looper](https://github.com/pepkit/hello_looper) for a specific example. - -Briefly, the Looper config file must contain a pipestat field. A project name must be supplied if running a project level pipeline. The user must also supply a file path for a results file if using a local file backend or database credentials if using a postgresql database backend. 
- -```yaml -pep_config: project_config_pipestat.yaml # pephub registry path or local path -output_dir: output -sample_table: annotation_sheet.csv -pipeline_interfaces: - sample: ./pipeline_interface1_sample_pipestat.yaml - project: ./pipeline_interface1_project_pipestat.yaml -pipestat: - project_name: TEST_PROJECT_NAME - results_file_path: tmp_pipestat_results.yaml - flag_file_dir: output/results_pipeline - database: - dialect: postgresql - driver: psycopg2 - name: pipestat-test - user: postgres - password: pipestat-password - host: 127.0.0.1 - port: 5432 -``` -And the pipeline interface must include information required by pipestat such as pipeline_name, pipeline_type, and an output schema path: -```yaml -pipeline_name: example_pipestat_pipeline -pipeline_type: sample -output_schema: pipeline_pipestat/pipestat_output_schema.yaml -command_template: > - python {looper.piface_dir}/count_lines.py {sample.file} {sample.sample_name} {pipestat.results_file} - -``` - - - - -### Pipestat Configuration for Looper Versions 1.4.0-1.5.0 -Note: The instructions below are for older versions of Looper. - -Generally, pipestat configuration comes from 3 sources, with the following priority: - -1. `PipestatManager` constructor -2. Pipestat configuration file -3. Environment variables - -In looper, only 1 and 2 are available, and can be specified via the project or sample attributes. Pipestat environment variables are *intentionally not supported* to ensure looper runs are reproducible -- otherwise, jobs configured in one computing environment could lead to totally different configuration and errors in other environments. - -## Usage - -The `PipestatManager` constructor attributes mentioned in the previous section are sourced from either sample attributes (for `looper run`) or project attributes ( for`looper runp`). 
One of the attributes can be used to specify the [pipestat configuration file](http://pipestat.databio.org/en/latest/config/), which is the other way of configuring pipestat. - -The *names* of the attributes can be adjusted in the PEP configuration file. Let's take a pipestat namespace as an example: by default the value for the namespace is taken from `Sample.sample_name` but can be changed with `looper.pipestat.sample.namespace_attribute` in the PEP configuration file, like so: - -```yaml -looper: - pipestat: - sample: - namespace_attribute: custom_attribute -``` - -Now the value for the pipestat namespace will be sourced from `Sample.custom_attribute` rather than `Sample.sample_name`. - -Similarly, a project-level pipestat namespace can be configured with `looper.pipestat.project.namespace_attribute`: - -```yaml -looper: - pipestat: - project: - namespace_attribute: custom_attribute -``` - -Now the value for the pipestat namespace will be sourced from `Project.custom_attribute` rather than `Project.name`. - -Naturally, this configuration procedure can be applied to other pipestat options. The only exception is pipestat results schema, which is never specified here, since it's sourced from the `output_schema` attribute of the pipeline interface. - -```yaml -looper: - pipestat: - sample: - results_file_attribute: pipestat_results_file - config_attribute: pipestat_config - namespace_attribute: sample_name - project: - results_file_attribute: pipestat_results_file - config_attribute: pipestat_config - namespace_attribute: name -``` - -Again, the values above are defaults -- not needed, but configurable. - -## Examples - -To make the pipestat configuration rules more clear let's consider the following pipestat configuration setups. - -### **Example 1:** All configuration as sample attributes - -In this case the pipestat configuration options are sourced only from the sample attributes. Namely, `pipestat_results_file` and `custom_namespace`. 
- -#### PEP config - -```yaml -pep_version: 2.0.0 -sample_table: sample_table.csv -sample_modifiers: - append: - pipestat_results_file: $HOME/my_results_file.yaml - derive: - attributes: [custom_namespace] - sources: - namespace: "{sample_name}_pipelineX" -looper: - pipestat: - sample: - namespace_attribute: "custom_namespace" -``` - -#### PEP sample table (`sample_table.csv`) - -```csv -sample_name,custom_namespace -sample1,namespace -``` - -### **Example 2:** A mix of pipestat configuration sources - -In this case the pipestat configuration options are sourced from both sample attributes and pipestat configuration file. - -Looper sourced the value for pipestat namespace from `Sample.sample_name` and database login credentials from the pipestat configuration file. - -#### PEP config - -```yaml -pep_version: 2.0.0 -sample_table: sample_table.csv -sample_modifiers: - append: - pipestat_config: pipestat_config.yaml -``` - -#### PEP sample table (`sample_table.csv`) - -```csv -sample_name -sample1 -``` - -#### Pipestat configuration file (`pipestat_config.yaml`) - -```yaml -database: - name: database_name - user: user_name - password: user_password - host: localhost - port: 5432 - dialect: postgresql - driver: psycopg2 -``` diff --git a/docs/pre-submission-hooks.md b/docs/pre-submission-hooks.md deleted file mode 100644 index d0628a769..000000000 --- a/docs/pre-submission-hooks.md +++ /dev/null @@ -1,282 +0,0 @@ -# Pre-submission hooks - -## Purpose - -Sometimes there is a need to perform some job/submission related tasks *before* the main pipeline submission. For example, we may need to generate a particular representation of the sample metadata to be consumed by a pipeline run. Some of these pre-submission tasks may depend on the information outside of the sample, such as the compute settings. For this purpose looper provides **pre-submission hooks**, which allows users to run arbitrary shell commands or Python functions before submitting the actual pipeline. 
These hooks have access to all of the job submission settings looper uses to populate the primary command template. They can be used in two ways: 1) to simply run required tasks, producing required output before the pipeline is run; and 2) to modify the job submission settings, which can then be used in the actual submission template. - - -## How to specify pre-submission tasks in the pipeline interface - -The pre-submission tasks to be executed are listed in the [pipeline interface](pipeline-interface-specification.md) file under the top-level `pre_submit` key. The `pre_submit` section is divided into two subsections corresponding to two types of hooks: `python_functions` and `command_templates`. The `python_functions` key specifies a list of strings corresponding to Python functions to run. The `command_templates` key is more generic, specifying shell command templates to be executed in a subprocess. Here is an example: - -```yaml -pre_submit: - python_functions: - - "package_name.function_name" - - "package_name1.function_name" - command_templates: - - "tool.sh --param {sample.attribute}" - - "tool1.sh --param {sample.attribute1}" -``` - -Because the looper variables are the input to each task, and are also potentially modified by each task, the order of execution is critical. Execution order follows two rules: First, `python_functions` are *always* executed before `command_templates`; and second, the user-specified order in the pipeline interface is preserved within each subsection. - -## Built-in pre-submission functions - -Looper ships with several included plugins that you can use as pre-submission functions without installing additional software. These plugins produce various representations of the sample metadata, which can be useful for different types of pipelines. The included plugins are described below: - - -### Included plugin: `looper.write_sample_yaml` - -Saves all sample metadata as a YAML file. 
The output file path can be customized using `var_templates.sample_yaml_path`. If this parameter is not provided, the file will be saved as `{looper.output_dir}/submission/{sample.sample_name}_sample.yaml`. - -**Parameters:** - - - `pipeline.var_templates.sample_yaml_path` (optional): absolute path to file where YAML is to be stored. - -**Usage:** - -```yaml -pipeline_type: sample -var_templates: - main: "{looper.piface_dir}/pipelines/pipeline1.py" - sample_yaml_path: "{looper.output_dir}/custom_sample_yamls" -pre_submit: - python_functions: - - looper.write_sample_yaml -command_template: > - {pipeline.var_templates.main} {sample.sample_yaml_path} ... -``` - -### Included plugin: `looper.write_sample_yaml_cwl` - -This plugin writes a sample yaml file compatible as a job input file for a CWL pipeline. This plugin allows looper to be used as a scatterer to run an independent CWL workflow for each sample in your PEP sample table. You can parametrize the plugin with a custom output file name using `sample_yaml_cwl_path`. If the parameter is not provided, the file will be saved in `{looper.output_dir}/submission/{sample.sample_name}_sample_cwl.yaml`. - -**Parameters:** - - - `pipeline.var_templates.sample_yaml_path` (optional): absolute path to file where YAML is to be stored. - -**Usage:** - -```yaml -pipeline_type: sample -var_templates: - main: "{looper.piface_dir}/pipelines/pipeline1.py" - sample_yaml_cwl_path: "{looper.output_dir}/custom_sample_yamls/custom_{sample.name}.yaml" -pre_submit: - python_functions: - - looper.write_sample_yaml_cwl -command_template: > - {pipeline.var_templates.main} {sample.sample_yaml_cwl} ... -``` - - -### Included plugin: `looper.write_sample_yaml_prj` - -Saves the sample to YAML file with project reference. This plugin can be parametrized with a custom YAML directory (see "parameters" below). If the parameter is not provided, the file will be saved in `{looper.output_dir}/submission/{sample.sample_name}_sample_prj.yaml`. 
- -**Parameters:** - - - `pipeline.var_templates.sample_yaml_prj_path` (optional): absolute path to file where YAML is to be stored. - -**Usage:** - -```yaml -pipeline_type: sample -var_templates: - main: "{looper.piface_dir}/pipelines/pipeline1.py" - sample_yaml_prj_path: "{looper.output_dir}/custom_sample_yamls" -pre_submit: - python_functions: - - looper.write_sample_yaml_prj -command_template: > - {pipeline.var_templates.main} ... -``` - -### Included plugin: `looper.write_submission_yaml` - -Saves all five namespaces of pre-submission to YAML file. This plugin can be parametrized with a custom YAML directory (see "parameters" below). If the parameter is not provided, the file will be saved in `{looper.output_dir}/submission/{sample.sample_name}_submission.yaml`. - -**Parameters:** - - - `pipeline.var_templates.submission_yaml_path` (optional): a complete and absolute path to the *directory* where submission YAML representation is to be stored. - -**Example usage:** - -```yaml -pipeline_type: sample -var_templates: - main: "{looper.piface_dir}/pipelines/pipeline1.py" - submission_yaml_path: "{looper.output_dir}/custom_path" -pre_submit: - python_functions: - - looper.write_submission_yaml -command_template: > - {pipeline.var_templates.main} ... -``` - -### Included plugin: `looper.write_custom_template` - -Populates an independent jinja template with values from all the available looper namespaces. - -**Parameters:** -- `pipeline.var_templates.custom_template` (required): a jinja template to be populated for each job. -- `pipeline.var_templates.custom_template_output` (optional): path to which the populated template file will be saved. 
If not provided, the populated fill will be saved in `{looper.output_dir}/submission/{sample.sample_name}_config.yaml - -**Example usage:** - -```yaml -pipeline_type: sample -var_templates: - custom_template: custom_template.jinja - custom_template_output: "{looper.output_dir}/submission/{sample.sample_name}_custom_config.yaml" -pre_submit: - python_functions: - - looper.write_custom_template -command_template: > - {pipeline.var_templates.main} ... -``` - - -## Writing your own pre-submission hooks - -Pre-submission tasks can be written as a Python function or a shell commands. We will explain each type below: - -### Python functions - -Python plugin functions have access *all of the metadata variables looper has access to to construct the primary command template*. The Python function must obey the following rules: - -1. The Python function *must* take as input a `namespaces` object, which is a Python [`dict`](https://docs.python.org/3/tutorial/datastructures.html#dictionaries) of [looper variable namespaces](variable-namespaces.md). - -2. The function *should* return any updated namespace variables; or can potentially return an empty `dict` (`{}`) if no changes are intended, which may the case if the function is only used for its side effect. - -#### Custom function input parameters - -How can you parameterize your plugin function? Since the function will have access to all the looper variable namespaces, this means that plugin authors may require users to specify any attributes within any namespace to parametrize them. For example, a plugin that increases the compute wall time by an arbitrary amount of time may require `extra_time` attribute in the `pipeline` namespace. Users would specify this parameter like this: - -```{yaml} -pipeline_name: my_pipeline -pipeline_type: sample -extra_time: 3 -``` - -This variable would be accessible in your python function as `namespaces["pipeline"]["extra_time"]`. 
This works, but we recommend keeping things clean by putting all required pipeline parameters into the [`pipeline.template_vars`](pipeline-interface-specification.md#var_templates) section. This not only keeps things tidy in a particular section, but also adds additional functionality of making these templates that can themselves refer to namespace variables, which can be very convenient. For example, a better approach would be: - -```{yaml} -pipeline_name: my_pipeline -pipeline_type: sample -var_templates: - extra_time: 3 - plugin_path: "{looper.piface_dir}/plugin_results" -``` - -In this example you'd use `namespaces["pipeline"]["var_templates"]["extra_time"]` to access the user-provided parameter. Notice we included another example, `plugin_path`, which can refer to the `{looper.piface_dir}` variable. Because this variable is included under `var_templates`, it will be populated with any namespace variables. - -The plugins need to handle incomplete parametrization, either by providing defaults or by raising exceptions. - -#### Function output: updating submission metadata via return value - -One of the features of the pre-submission hooks is that they can be used to update the [looper variable namespaces](variable-namespaces.md) so that you can use modified variables in your primary command template. This is effectively a way for a plugin function to provide output that can be used by looper. The way this works is that after every successful pre-submission hook execution, the input namespaces are updated with the return value of the hook execution. Existing values are overwritten with the returned ones, whereas omitted values are not changed. Therefore, you must simply write your function to return any updated variables in the same format as in the input function. 
That is, your return value should be a Python [`dict`](https://docs.python.org/3/tutorial/datastructures.html#dictionaries) of [looper variable namespaces](variable-namespaces.md) - - -For example, given this input (which represents the looper variable namespaces): - -Input: -```yaml -sample: - name: test - size: 30 - genome: hg38 -looper: - log_file: /home/michal/my_log.txt - job_name: test_pepatac -compute: - submission_template: /home/michal/divvy_templates/localhost_template.sub - submission_command: sh -... -``` - -Say your function returned this data: -```yaml -sample: - size: 1000 -looper: - log_file: /home/michal/Desktop/new_log.txt -``` - -Then looper would have this object available for populating the primary command template (input + returned data): -```yaml -sample: - name: test - size: 1000 - genome: hg38 -looper: - log_file: /home/michal/Desktop/new_log.txt - job_name: test_pepatac -compute: - submission_template: /home/michal/divvy_templates/localhost_template.sub - submission_command: sh -... -``` - -### Shell command plugins - -In case you need more flexibility than a Python function, you can also execute arbitrary commands as a pre-submission task. You define exactly what command you want to run, like this: - -```yaml -var_templates: - compute_script: "{looper.piface_dir}/hooks/script.py" -pre_submit: - command_templates: - - "{pipeline.var_templates.compute_script} --genome {sample.genome} --log-file {looper.output_dir}/log.txt" -``` - -This `command_templates` section specifies a list with one or more entries. Each entry specifies a command. The commands are themselves templates, just like the primary `command_template`, so you have access to the looper variable namespaces to put together the appropriate command. In fact, really, the other difference between these `pre_submit.command_templates` and the primary `command_template` is that the final one has access to the changes introduce in the variables by the `pre_submit` commands. 
The inputs to the script are completely user-defined -- you choose what information and how you want to pass it to your script. - -**Output:** The output of your command should be a JSON-formatted string (`str`), that is processed with [json.loads](https://docs.python.org/3/library/json.html#json.loads) and [subprocess.check_output](https://docs.python.org/3/library/subprocess.html#subprocess.check_output) as follows: `json.loads(subprocess.check_output(str))`. This JSON object will be used to update the looper variable namespaces. - -#### Example: Dynamic compute parameters - -In the `compute` section of the pipeline interface, looper allows you to specify a `size_dependent_variables` section, which lets you specify variables with values that are modulated based on the total input file size for the run. This is typically used to add variables for memory, CPU, and clock time to request, if they depend on the input file size. This a good example of modulating computing variables based on file size, but it is not flexible enough to allow modulated compute variables on the basis of other sample attributes. For a more flexible version, you can use a pre-submission hook. - -The `pre_submit.command_templates` specifies a list of Jinja2 templates to construct a system command run in a subprocess. This command template has available all of the namespaces in the primary command template. The command should return a JSON object, which is then used to populate the namespaces. This allows you to specify computing variables that depend on any attributes of a project, sample, or pipeline, which can be used for ultimate flexibility in computing. 
- -**Usage**: - -```yaml -pipeline_type: sample -var_templates: - pipeline_path: "{looper.piface_dir}/pipelines/pepatac.py" - compute_script: "{looper.piface_dir}/hooks/script.py" -pre_submit: - command_templates: - - "{pipeline.var_templates.compute_script} --genome {sample.genome} --log-file {looper.output_dir}/log.txt" -command_template: > - {pipeline.var_templates.pipeline_path} ... -``` - -**Script example:** - -```python -#!/usr/bin/env python3 - -import json -from argparse import ArgumentParser - -parser = ArgumentParser(description="Test script") - -parser.add_argument("-s", "--sample-size", help="Sample size", required=False) -parser.add_argument("-g", "--genome", type=str, help="Genome", required=True) -parser.add_argument("-m", "--log-file", type=str, help="Log file path", required=True) -parser.add_argument("-c", "--custom-cores", type=str, help="Force number of cores to use", required=False) -args = parser.parse_args() - -y = json.dumps({ - "cores": args.custom_cores or "4", - "mem": "10000" if args.genome == "hg38" else "20000", - "time": "00-11:00:00", - "logfile": args.log_file -}) - -print(y) -``` diff --git a/docs/running-a-pipeline.md b/docs/running-a-pipeline.md deleted file mode 100644 index c6aad0f72..000000000 --- a/docs/running-a-pipeline.md +++ /dev/null @@ -1,19 +0,0 @@ -# How to run a pipeline - -You first have to [define your project](defining-a-project.md) and a [config file](looper-config.md). This will give you a PEP linked to a pipeline. Next, we'll run the pipeline. - -The basic command is `looper run`. To run your pipeline, just: - -```console -looper run --looper-config .your_looper_config.yaml -``` - -This will submit a job for each sample. That's basically all there is to it; after this, there's a lot of powerful options and tweaks you can do to control your jobs. Here we'll just mention a few of them. - -- **Dry runs**. You can use `-d, --dry-run` to create the job submission scripts, but not actually run them. 
This is really useful for testing that everything is set up correctly before you commit to submitting hundreds of jobs. -- **Limiting the number of jobs**. You can `-l, --limit` to test a few before running all samples. You can also use the `--selector-*` arguments to select certain samples to include or exclude. -- **Grouping jobs**. You can use `-u, --lump` or `-n, --lumpn` to group jobs. [More details on grouping jobs](grouping-jobs.md). -- **Changing compute settings**. You can use `-p, --package`, `-s, --settings`, or `-c, --compute` to change the compute templates. Read more in [running on a cluster](running-on-a-cluster.md). -- **Time delay**. You can stagger submissions to not overload a submission engine using `--time-delay`. -- **Use rerun to resubmit jobs**. To run only jobs that previously failed, try `looper rerun`. -- **Tweak the command on-the-fly**. The `--command-extra` arguments allow you to pass extra arguments to every command straight through from looper. See [parameterizing pipelines](parameterizing-pipelines.md). diff --git a/docs/running-on-a-cluster.md b/docs/running-on-a-cluster.md deleted file mode 100644 index 76fe54ae5..000000000 --- a/docs/running-on-a-cluster.md +++ /dev/null @@ -1,24 +0,0 @@ -# How to submit looper jobs to a cluster - -By default, `looper` will build a shell script for each sample and then run it sequentially on the local computer. This is convenient for simple cases, but when it comes time to scale up, this is where `looper` really excels. Looper uses a powerful [concentric template system](concentric-templates.md) that enables looper to run jobs on any cluster resource manager (like SLURM, SGE, LFS, etc.) by simply setting up a template for it. The environment templates are managed by [divvy](http://code.databio.org/divvy). - -## Overview and basic example of cluster computing - -To configure `looper` for cluster computing, you just configure divvy. Divvy is automatically installed when you install looper. 
Briefly, first create a `divvy` computing configuration file using `divvy init`: - -```bash -export DIVCFG="divvy_config.yaml" -divvy init -c $DIVCFG -``` - -Looper will now have access to your computing configuration. You can run `divvy list` to see what compute packages are available in this file. For example, you'll start with a package called 'slurm', which you can use with looper by calling `looper --package slurm`. For many systems (SLURM, SGE, LFS, etc), the default divvy configuration will work out of the box. If you need to tweak things, the template system is flexible and you can configure it to run in any compute environment. That's all there is to it. - -Complete details on how to configure divvy are described in the [divvy documentation](http://divvy.databio.org). - -## Divvy config file locations - -Looper will by default will look for the divvy configuration file in `$DIVCFG`, but you can override this by specifying a path to other file with `--divvy` argument, like this: - -```bash -looper --divvy /path/to/env_cfg.yaml ... -``` diff --git a/docs/sample-annotation-sheet.md b/docs/sample-annotation-sheet.md deleted file mode 100644 index 0c8a3884f..000000000 --- a/docs/sample-annotation-sheet.md +++ /dev/null @@ -1,47 +0,0 @@ -# Sample annotation sheet - -The *sample annotation sheet* is a csv file containing information about all samples in a project. -This should be regarded as static and a project's most important metadata. -**One row corresponds to one pipeline run** (if there's just one pipeline run per sample, there's 1:1 correspondence between rows and samples as well.) - -A sample annotation sheet may contain any number of columns you need for your project. -You can think of these columns as *sample attributes*, and you may use these columns later in your pipelines or analysis. -For example, you could define a column called `organism` and use the resulting attribute on a sample to adjust the assembly used by a pipeline through which it's run. 
- -## Special columns - -Certain keyword columns are required or provide `looper`-specific features. -Any additional columns become attributes of your sample and will be part of the project's metadata for the samples. -Mostly, you have control over any other column names you want to add, but there are a few reserved column names: - -- `sample_name` - a **unique** string1 identifying each sample. This is **required** for `Sample` construction, -but it's the *only required column*. -- `organism` - a string identifying the organism ("human", "mouse", "mixed"). ***Recommended** but not required*. -- `library` - While not needed to build a `Sample`, this column is required for submission of job(s). -It specifies the source of data for the sample (e.g. ATAC-seq, RNA-seq, RRBS). -`looper` uses this information to determine which pipelines are relevant for the `Sample`. -- `data_source` - This column is used by default to specify the location of the input data file. -Usually you want your annotation sheet to specify the locations of files corresponding to each sample. -You can use this to simplify pointing to file locations with a neat string-replacement method that keeps things clean and portable. -For more details, see the [derived columns page](derived-columns.md). -Really, you just need any column specifying at least 1 data file for input. This is **required** for `looper` to submit job(s) for a `Sample`. -- `toggle` - If the value of this column is not 1, `looper` will not submit the pipeline for that sample. -This enables you to submit a subset of samples. 
- -Here's an **example** annotation sheet: - -```CSV -sample_name, library, organism, flowcell, lane, BSF_name, data_source -"albt_0h", "RRBS", "albatross", "BSFX0190", "1", "albt_0h", "bsf_sample" -"albt_1h", "RRBS", "albatross", "BSFX0190", "1", "albt_1h", "bsf_sample" -"albt_2h", "RRBS", "albatross", "BSFX0190", "1", "albt_2h", "bsf_sample" -"albt_3h", "RRBS", "albatross", "BSFX0190", "1", "albt_3h", "bsf_sample" -"frog_0h", "RRBS", "frog", "", "", "", "frog_data" -"frog_1h", "RRBS", "frog", "", "", "", "frog_data" -"frog_2h", "RRBS", "frog", "", "", "", "frog_data" -"frog_3h", "RRBS", "frog", "", "", "", "frog_data" - -``` - -1 The sample name should contain no whitespace. If it does, an error will be thrown. -Similarly, `looper` will not allow any duplicate entries under sample_name. diff --git a/docs/support.md b/docs/support.md deleted file mode 100644 index f844c3557..000000000 --- a/docs/support.md +++ /dev/null @@ -1,5 +0,0 @@ -# Support - -Please use the [issue tracker at GitHub](https://github.com/pepkit/looper/issues) to file bug reports or feature requests. - -Looper supports Python 2.7 and Python 3, and has been tested in Linux. If you clone this repository and then an attempt at local installation, e.g. with `pip install --upgrade ./`, fails, this may be due to an issue with `setuptools` and `six`. A `FileNotFoundError` (Python 3) or an `IOError` (Python2), with a message/traceback about a nonexistent `METADATA` file means that this is even more likely the cause. To get around this, you can first manually `pip install --upgrade six` or `pip install six==1.11.0`, as upgrading from `six` from 1.10.0 to 1.11.0 resolves this issue, then retry the `looper` installation. 
diff --git a/docs/usage.md b/docs/usage.md deleted file mode 100644 index c8c58a5fe..000000000 --- a/docs/usage.md +++ /dev/null @@ -1,669 +0,0 @@ -# Usage reference - -Looper doesn't just run pipelines; it can also check and summarize the progress of your jobs, as well as remove all files created by them. - -Each task is controlled by one of the following commands: `run`, `rerun`, `runp` , `table`,`report`, `destroy`, `check`, `clean`, `inspect`, `init` - -- `looper run`: Runs pipelines for each sample, for each pipeline. This will use your `compute` settings to build and submit scripts to your specified compute environment, or run them sequentially on your local computer. - -- `looper runp`: Runs pipelines for each pipeline for project. - -- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. - -- `looper report`: Summarize your project results in a form of browsable HTML pages. - -- `looper table`: This command parses all key-value results reported in the each sample `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table - -- `looper check`: Checks the run progress of the current project. This will display a summary of job status; which pipelines are currently running on which samples, which have completed, which have failed, etc. - -- `looper destroy`: Deletes all output results for this project. - -- `looper inspect`: Display the Project or Sample information - -- `looper init`: Initialize a looper dotfile (`.looper.yaml`) in the current directory - - -Here you can see the command-line usage instructions for the main looper command and for each subcommand: -## `looper --help` -```console -usage: looper [-h] [-v] [--silent] [--verbosity VERBOSITY] [--logdev] - {run,rerun,runp,table,report,destroy,check,clean,init,init_piface,link,inspect} - ... 
- -Looper Pydantic Argument Parser - -commands: - {run,rerun,runp,table,report,destroy,check,clean,init,init_piface,link,inspect} - run Run or submit sample jobs. - rerun Resubmit sample jobs with failed flags. - runp Run or submit project jobs. - table Write summary stats table for project samples. - report Create browsable HTML report of project results. - destroy Remove output files of the project. - check Check flag status of current runs. - clean Run clean scripts of already processed jobs. - init Initialize looper config file. - init_piface Initialize generic pipeline interface. - link Create directory of symlinks for reported results. - inspect Print information about a project. - -optional arguments: - --silent Whether to silence logging (default: False) - --verbosity VERBOSITY - Alternate mode of expression for logging level that - better accords with intuition about how to convey - this. (default: None) - --logdev Whether to log in development mode; possibly among - other behavioral changes to logs handling, use a more - information-rich message format template. 
(default: - False) - -help: - -h, --help show this help message and exit - -v, --version show program's version number and exit -``` - -## `looper run --help` -```console -usage: looper run [-h] [-i] [-t TIME_DELAY] [-d] [-x COMMAND_EXTRA] - [-y COMMAND_EXTRA_OVERRIDE] [-u LUMP] [-n LUMP_N] - [-j LUMP_J] [--divvy DIVVY] [-f] [-c COMPUTE [COMPUTE ...]] - [--package PACKAGE] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - -i, --ignore-flags Ignore run status flags (default: False) - -t TIME_DELAY, --time-delay TIME_DELAY - Time delay in seconds between job submissions (min: 0, - max: 30) (default: 0) - -d, --dry-run Don't actually submit jobs (default: False) - -x COMMAND_EXTRA, --command-extra COMMAND_EXTRA - String to append to every command (default: ) - -y COMMAND_EXTRA_OVERRIDE, --command-extra-override COMMAND_EXTRA_OVERRIDE - Same as command-extra, but overrides values in PEP - (default: ) - -u LUMP, --lump LUMP Total input file size (GB) to batch into one job - (default: None) - -n LUMP_N, --lump-n LUMP_N - Number of commands to batch into one job (default: - None) - -j LUMP_J, --lump-j LUMP_J - Lump samples into number of jobs. (default: None) - --divvy DIVVY Path to divvy configuration file. Default=$DIVCFG env - variable. Currently: not set (default: None) - -f, --skip-file-checks - Do not perform input file checks (default: False) - -c COMPUTE [COMPUTE ...], --compute COMPUTE [COMPUTE ...] 
- List of key-value pairs (k1=v1) (default: []) - --package PACKAGE Name of computing resource package to use (default: - None) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? 
(default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper runp --help` -```console -usage: looper runp [-h] [-i] [-t TIME_DELAY] [-d] [-x COMMAND_EXTRA] - [-y COMMAND_EXTRA_OVERRIDE] [-u LUMP] [-n LUMP_N] - [--divvy DIVVY] [-f] [-c COMPUTE [COMPUTE ...]] - [--package PACKAGE] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - -i, --ignore-flags Ignore run status flags (default: False) - -t TIME_DELAY, --time-delay TIME_DELAY - Time delay in seconds between job submissions (min: 0, - max: 30) (default: 0) - -d, --dry-run Don't actually submit jobs (default: False) - -x COMMAND_EXTRA, --command-extra COMMAND_EXTRA - String to append to every command (default: ) - -y COMMAND_EXTRA_OVERRIDE, --command-extra-override COMMAND_EXTRA_OVERRIDE - Same as command-extra, but overrides values in PEP - (default: ) - -u LUMP, --lump LUMP Total input file size (GB) to batch into one job - (default: None) - -n LUMP_N, --lump-n LUMP_N - Number of commands to batch into one job (default: - None) - --divvy DIVVY Path to divvy configuration file. Default=$DIVCFG env - variable. Currently: not set (default: None) - -f, --skip-file-checks - Do not perform input file checks (default: False) - -c COMPUTE [COMPUTE ...], --compute COMPUTE [COMPUTE ...] 
- List of key-value pairs (k1=v1) (default: []) - --package PACKAGE Name of computing resource package to use (default: - None) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? 
(default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper rerun --help` -```console -usage: looper rerun [-h] [-i] [-t TIME_DELAY] [-d] [-x COMMAND_EXTRA] - [-y COMMAND_EXTRA_OVERRIDE] [-u LUMP] [-n LUMP_N] - [-j LUMP_J] [--divvy DIVVY] [-f] - [-c COMPUTE [COMPUTE ...]] [--package PACKAGE] - [--settings SETTINGS] [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - -i, --ignore-flags Ignore run status flags (default: False) - -t TIME_DELAY, --time-delay TIME_DELAY - Time delay in seconds between job submissions (min: 0, - max: 30) (default: 0) - -d, --dry-run Don't actually submit jobs (default: False) - -x COMMAND_EXTRA, --command-extra COMMAND_EXTRA - String to append to every command (default: ) - -y COMMAND_EXTRA_OVERRIDE, --command-extra-override COMMAND_EXTRA_OVERRIDE - Same as command-extra, but overrides values in PEP - (default: ) - -u LUMP, --lump LUMP Total input file size (GB) to batch into one job - (default: None) - -n LUMP_N, --lump-n LUMP_N - Number of commands to batch into one job (default: - None) - -j LUMP_J, --lump-j LUMP_J - Lump samples into number of jobs. (default: None) - --divvy DIVVY Path to divvy configuration file. Default=$DIVCFG env - variable. Currently: not set (default: None) - -f, --skip-file-checks - Do not perform input file checks (default: False) - -c COMPUTE [COMPUTE ...], --compute COMPUTE [COMPUTE ...] 
- List of key-value pairs (k1=v1) (default: []) - --package PACKAGE Name of computing resource package to use (default: - None) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? 
(default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper report --help` -```console -usage: looper report [-h] [--portable] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] - [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] - [--sel-excl SEL_EXCL] [-l LIMIT] [-k SKIP] - [--pep-config PEP_CONFIG] [-o OUTPUT_DIR] - [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - --portable Makes html report portable. (default: False) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] 
- Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? (default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper table --help` -```console -usage: looper table [-h] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] 
- Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? (default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper inspect --help` -```console -usage: looper inspect [-h] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] - [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] - [--sel-excl SEL_EXCL] [-l LIMIT] [-k SKIP] - [--pep-config PEP_CONFIG] [-o OUTPUT_DIR] - [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] 
- Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? (default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper init --help` -```console -usage: looper init [-h] [-f] [-o OUTPUT_DIR] [--pep-config PEP_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - -optional arguments: - -f, --force-yes Provide upfront confirmation of destruction intent, to - skip console query. 
Default=False (default: False) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - -help: - -h, --help show this help message and exit -``` - -## `looper destroy --help` -```console -usage: looper destroy [-h] [-d] [-f] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] - [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] - [--sel-excl SEL_EXCL] [-l LIMIT] [-k SKIP] - [--pep-config PEP_CONFIG] [-o OUTPUT_DIR] - [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - -d, --dry-run Don't actually submit jobs (default: False) - -f, --force-yes Provide upfront confirmation of destruction intent, to - skip console query. Default=False (default: False) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] 
- Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? (default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper check --help` -```console -usage: looper check [-h] [--describe-codes] [--itemized] - [-f FLAGS [FLAGS ...]] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - --describe-codes Show status codes description. 
Default=False (default: - False) - --itemized Show detailed overview of sample statuses. - Default=False (default: False) - -f FLAGS [FLAGS ...], --flags FLAGS [FLAGS ...] - Only check samples based on these status flags. - (default: []) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] - Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? 
(default: - False) - -help: - -h, --help show this help message and exit -``` - -## `looper clean --help` -```console -usage: looper clean [-h] [-d] [-f] [--settings SETTINGS] - [--exc-flag EXC_FLAG [EXC_FLAG ...]] - [--sel-flag SEL_FLAG [SEL_FLAG ...]] [--sel-attr SEL_ATTR] - [--sel-incl SEL_INCL [SEL_INCL ...]] [--sel-excl SEL_EXCL] - [-l LIMIT] [-k SKIP] [--pep-config PEP_CONFIG] - [-o OUTPUT_DIR] [--config-file CONFIG_FILE] - [--looper-config LOOPER_CONFIG] - [-S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...]] - [-P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...]] - [--pipestat PIPESTAT] [--amend AMEND [AMEND ...]] - [--project] - -optional arguments: - -d, --dry-run Don't actually submit jobs (default: False) - -f, --force-yes Provide upfront confirmation of destruction intent, to - skip console query. Default=False (default: False) - --settings SETTINGS Path to a YAML settings file with compute settings - (default: ) - --exc-flag EXC_FLAG [EXC_FLAG ...] - Sample exclusion flag (default: []) - --sel-flag SEL_FLAG [SEL_FLAG ...] - Sample selection flag (default: []) - --sel-attr SEL_ATTR Attribute for sample exclusion OR inclusion (default: - toggle) - --sel-incl SEL_INCL [SEL_INCL ...] - Include only samples with these values (default: []) - --sel-excl SEL_EXCL Exclude samples with these values (default: ) - -l LIMIT, --limit LIMIT - Limit to n samples (default: None) - -k SKIP, --skip SKIP Skip samples by numerical index (default: None) - --pep-config PEP_CONFIG - PEP configuration file (default: None) - -o OUTPUT_DIR, --output-dir OUTPUT_DIR - Output directory (default: None) - --config-file CONFIG_FILE - Project configuration file (default: None) - --looper-config LOOPER_CONFIG - Looper configuration file (YAML) (default: None) - -S SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...], --sample-pipeline-interfaces SAMPLE_PIPELINE_INTERFACES [SAMPLE_PIPELINE_INTERFACES ...] 
- Paths to looper sample pipeline interfaces (default: - []) - -P PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...], --project-pipeline-interfaces PROJECT_PIPELINE_INTERFACES [PROJECT_PIPELINE_INTERFACES ...] - Paths to looper project pipeline interfaces (default: - []) - --pipestat PIPESTAT Path to pipestat files. (default: None) - --amend AMEND [AMEND ...] - List of amendments to activate (default: []) - --project Is this command executed for project-level? (default: - False) - -help: - -h, --help show this help message and exit -``` - diff --git a/docs/usage.template b/docs/usage.template deleted file mode 100644 index 59ba47b50..000000000 --- a/docs/usage.template +++ /dev/null @@ -1,26 +0,0 @@ -# Usage reference - -Looper doesn't just run pipelines; it can also check and summarize the progress of your jobs, as well as remove all files created by them. - -Each task is controlled by one of the following commands: `run`, `rerun`, `runp` , `table`,`report`, `destroy`, `check`, `clean`, `inspect`, `init` - -- `looper run`: Runs pipelines for each sample, for each pipeline. This will use your `compute` settings to build and submit scripts to your specified compute environment, or run them sequentially on your local computer. - -- `looper runp`: Runs pipelines for each pipeline for project. - -- `looper rerun`: Exactly the same as `looper run`, but only runs jobs with a failed flag. - -- `looper report`: Summarize your project results in a form of browsable HTML pages. - -- `looper table`: This command parses all key-value results reported in the each sample `stats.tsv` and collates them into a large summary matrix, which it saves in the project output directory. This creates such a matrix for each pipeline type run on the project, and a combined master summary table - -- `looper check`: Checks the run progress of the current project. 
This will display a summary of job status; which pipelines are currently running on which samples, which have completed, which have failed, etc. - -- `looper destroy`: Deletes all output results for this project. - -- `looper inspect`: Display the Project or Sample information - -- `looper init`: Initialize a looper dotfile (`.looper.yaml`) in the current directory - - -Here you can see the command-line usage instructions for the main looper command and for each subcommand: diff --git a/docs/using-geofetch.md b/docs/using-geofetch.md deleted file mode 100644 index 113b1252d..000000000 --- a/docs/using-geofetch.md +++ /dev/null @@ -1,35 +0,0 @@ -# How to set up a new GEO project - -You can use [geofetch](http://geofetch.databio.org) to quickly set up a project to run with looper. - -## Download data - -``` -geofetch -i GSE69993 --just-metadata -m metadata -``` - -## Initialize looper - -Make it easier to run looper without specifying the config - -``` -looper init metadata/*.yaml -``` - -## Convert to fastq - -Now, you can convert the files from sra into fastq format: - -``` -looper run --amend sra_convert -``` - -## Run pipeline - -Add a pipeline interface to link to a project - -(Experimental) - -``` -looper mod "pipeline_interfaces: /path/to/piface.yaml" -``` diff --git a/docs/variable-namespaces.md b/docs/variable-namespaces.md deleted file mode 100644 index b3e2b2a8a..000000000 --- a/docs/variable-namespaces.md +++ /dev/null @@ -1,120 +0,0 @@ -# Looper variable namespaces - -## Populating the templates - -Looper creates job scripts using [concentric templates](concentric-templates.md) consisting of a *command template* and a *submission template*. This layered design allows us to decouple the computing environment from the pipeline, which improves portability. The task of running jobs can be thought of as simply populating the templates with variables. These variables are pooled from several sources: - -1. 
the command line, where the user provides any on-the-fly variables for a particular run. -2. the PEP, which provides information on the project and samples. -3. the pipeline interface, which provides information on the pipeline to run. -4. the divvy config file, which provides information on the computing environment. - -Variables from these sources are used to populate the templates to construct the commands to run. To keep things organized, looper groups the variables into namespaces. These namespaces are used first to populate the command template, which produces a built command. This command is then treated as a variable in itself, which is pooled with the other variables to populate the submission template. Looper provides 6 variable namespaces for populating the templates: - -## 1. project - -The `project` namespace contains all PEP config attributes. For example, if you have a config file like this: - -```yaml -pep_version: 2.0.0 -my_variable: 123 -``` - -Then `project.my_variable` would have value `123`. You can use the project namespace to refer to any information in the project. You can use `project.looper` to refer to any attributes in the `looper` section of the PEP. - -## 2. sample or samples - -For sample-level pipelines, the `sample` namespace contains all PEP post-processing sample attributes for the given sample. For project-level pipelines, looper constructs a single job for an entire project, so there is no `sample` namespace; instead, there is a `samples` (plural) namespace, which is a list of all the samples in the project. This can be useful if you need to iterate through all the samples in your command template. - -## 3. pipeline - -Everything under `pipeline` in the pipeline interface for this pipeline. This simply provides a convenient way to annotate pipeline-level variables for use in templates. - -## 4. 
looper - -The `looper` namespace consists of automatic variables created by looper: - -**paths:** - -- `output_dir` -- parent output directory provided in `project.looper.output_dir` in the project configuration file -- `results_subdir` -- the path to the results directory. It is a sub directory of `output_dir` called `project.looper.results_subdir` or "results_pipeline" by default -- `sample_output_folder` -- a sample-specific or project-specific output folder (`results_subdir`/`sample.sample_name`) -- `piface_dir` -- directory the pipeline interface has been read from -- `pep_config` -- path to the project configuration file used for this looper run -- `log_file` -- an automatically created log file path, to be stored in the looper submission subdirectory - -**others:** - -- `total_input_size` -- the sum of file sizes for all files marked as input files in the input schema -- `command` -- the result of populating the command template -- `job_name` -- job name made by concatenating the pipeline identifier and unique sample name - -The `looper.command` value is what enables the two-layer template system, whereby the output of the command template is used as input to the submission template. - -## 5. compute - -The `compute` namespace consists of a group of variables relevant for computing resources. The `compute` namespace has a unique behavior: it aggregates variables from several sources in a priority order, overriding values with more specific ones as priority increases. The list of variable sources in priority order is: - -1. Looper CLI (`--compute` or `--settings` for on-the-fly settings) -2. PEP config, `project.looper.compute` section -3. Pipeline interface, `compute` section -4. Activated divvy compute package (`--package` CLI argument) - -So, the compute namespace is first populated with any variables from the selected divvy compute package. It then updates this with settings given in the `compute` section of the pipeline interface. 
It then updates from the PEP `project.looper.compute`, and then finally anything passed to `--compute` on the looper CLI. This provides a way to modulate looper behavior at the level of a computing environment, a pipeline, a project, or a run, in that order. - -## 6. pipestat - -The `pipestat` namespace consists of a group of variables that reflect the [pipestat](http://pipestat.databio.org) configuration for a submission. - -1. results_file (`pipestat.file`) -2. record_id (`pipestat.record_identifier`) -3. config (`pipestat.config_path`) - -## Mapping variables to submission templates using divvy adapters - -One remaining issue is how to map variables from the looper variable namespaces onto the variables used in divvy templates. Divvy is decoupled from looper, and its templates are completely customizable, so they do not necessarily understand how to connect looper variables to divvy templates. The default divvy templates use variables like `{CODE}`, `{JOBNAME}`, and `{LOGFILE}`, among others. A user may rename these or add custom variable names in divvy templates. How do we map the looper variables onto these arbitrary divvy template variables? Through divvy adapters. - -These variables are linked to looper namespaces via *divvy adapters*. Here are the default divvy adapters: - -```yaml -adapters: - CODE: looper.command - JOBNAME: looper.job_name - CORES: compute.cores - LOGFILE: looper.log_file - TIME: compute.time - MEM: compute.mem - DOCKER_ARGS: compute.docker_args - DOCKER_IMAGE: compute.docker_image - SINGULARITY_IMAGE: compute.singularity_image - SINGULARITY_ARGS: compute.singularity_args -``` - -The divvy adapters section in the divvy configuration file links the divvy template variable (left side) to any other arbitrary variable names (right side). In this example, we've populated the adapters with links to the namespaced input variables provided by looper (right side). 
You can adjust this section in your configuration file to map any variables into your submission template. - -## Best practices on storing compute variables - -Since compute variables can be stored in several places, it can be confusing to know where you should put things. Here are some guidelines: - -### Partition or queue name - -Because the partition or queue name is relative to your environment, we don't usually specify this in the `resources` section, but rather, in the `pepenv` config. - -### DIVCFG config file - -Variables that describe settings of a **compute environment** should go in the `DIVCFG` file. Any attributes in the activated compute package will be available to populate template variables. For example, the `partition` attribute is specified in many of our default `DIVCFG` files; that attribute is used to populate a template `{PARTITION}` variable. This is what enables pipelines to work in any compute environment, since we have no control over what your partitions are named. You can also use this to change SLURM queues on-the-fly. - -### Pipeline interface - -Variables that are **specific to a pipeline** can be defined in the `pipeline interface` file, `compute` section. As an example of a variable pulled from the `compute` section, we defined in our `pipeline_interface.yaml` a variable pointing to the singularity or docker image that can be used to run the pipeline, like this: - -```yaml -compute: - singularity_image: /absolute/path/to/images/image -``` - -Now, this variable will be available for use in a template as `{SINGULARITY_IMAGE}`. This makes sense to put in the pipeline interface because it is specific to this pipeline. This path should probably be absolute, because a relative path will be interpreted as relative to the working directory where your job is executed (*not* relative to the pipeline interface). This section is also useful for adjusting the amount of resources we need to request from a resource manager like SLURM. 
For example: `{MEM}`, `{CORES}`, and `{TIME}` are all defined frequently in this section, and they vary for different input file sizes. - -### Project config - -Finally, project-level variables can also be populated from the `compute` section of a project config file. This would enable you to make project-specific compute changes (such as billing a particular project to a particular SLURM resource account). diff --git a/docs/writing-a-pipeline-interface.md b/docs/writing-a-pipeline-interface.md deleted file mode 100644 index 7a9585eb4..000000000 --- a/docs/writing-a-pipeline-interface.md +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: Pipeline interface specification ---- - -# Writing a pipeline interface - -## Introduction - -If you want to use looper to run samples in a PEP through an arbitrary shell command, you will need to write a pipeline interface. Here is a basic walkthrough to write a simple interface file. Once you've been through this, you can consult the formal [pipeline interface format specification](pipeline-interface-specification.md) for further details and reference. - -## Example - -Let's start with a simple example from the [hello_looper repository](https://github.com/pepkit/hello_looper): - -```yaml -pipeline_name: count_lines -pipeline_type: sample -var_templates: - pipeline: {looper.piface_dir}/count_lines.sh -command_template: {pipeline.var_templates.pipeline} {sample.file} -``` - -You can edit this to start your own interface. - -First, think of a unique name for your pipeline and put it in `pipeline_name`. This will be used for messaging and identification. - -Next, choose a `pipeline_type`, which can be either "sample" or "project". Most likely, you're writing a sample pipeline, but you can read more about [sample and project pipelines](pipeline-tiers.md) if you like. - -Next, we need to set the `pipeline` path to our script. 
This path is relative to the pipeline interface file, so you need to put the pipeline interface somewhere specific relative to the pipeline; perhaps in the same folder or in a parent folder. -Note: previous versions used the `path` variable instead of `var_templates: pipeline:`. However, path functionality will be deprecated in the future. - -Finally, populate the `command_template`. You can use the full power of Jinja2 Python templates here, but most likely you'll just need to use a few variables using curly braces. In this case, we refer to the `count_lines.sh` script with `{pipeline.var_templates.pipeline}`, which points directly to the `pipeline` variable defined above. Then, we use `{sample.file}` to refer to the `file` column in the sample table specified in the PEP. This pipeline thus takes a single positional command-line argument. You can make the command template much more complicated and refer to any sample or project attributes, as well as a bunch of [other variables made available by looper](variable-namespaces.md). - -Now, you have a basic functional pipeline interface. There are many more advanced features you can use to make your pipeline more powerful, such as providing a schema to specify inputs or outputs, making input-size-dependent compute settings, and more. For complete details, consult the formal [pipeline interface format specification](pipeline-interface-specification.md). 
diff --git a/docs_jupyter/build/.gitignore b/docs_jupyter/build/.gitignore deleted file mode 100644 index d6b7ef32c..000000000 --- a/docs_jupyter/build/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -* -!.gitignore diff --git a/docs_jupyter/cli_divvy.ipynb b/docs_jupyter/cli_divvy.ipynb deleted file mode 100644 index 5b027bf62..000000000 --- a/docs_jupyter/cli_divvy.ipynb +++ /dev/null @@ -1,390 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# command-line tutorial\n", - "\n", - "`Divvy` also provides a command-line interface that gives you the same power as the python API. You can use `--help` to get a list of the command-line options:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "version: 0.5.0\n", - "usage: divvy [-h] [--version] [--verbosity V] [--silent] [--logdev]\n", - " {write,init,list,submit} ...\n", - "\n", - "divvy - write compute job scripts that can be submitted to any computing\n", - "resource\n", - "\n", - "positional arguments:\n", - " {write,init,list,submit}\n", - " write Write a job script\n", - " init Initialize a new divvy config file\n", - " list List available compute packages\n", - " submit Write and then submit a job script\n", - "\n", - "optional arguments:\n", - " -h, --help show this help message and exit\n", - " --version show program's version number and exit\n", - " --verbosity V Set logging level (1-5 or logging module level name)\n", - " --silent Silence logging. 
Overrides verbosity.\n", - " --logdev Expand content of logging message format.\n", - "\n", - "https://divvy.databio.org\n" - ] - } - ], - "source": [ - "divvy --help" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# The `list` command\n", - "\n", - "Let's first use `divvy list` to show us our available computing packages:" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using default config. No config found in env var: ['DIVCFG', 'PEPENV']\n", - "Using divvy config: /home/nsheff/.local/lib/python2.7/site-packages/divvy/default_config/divvy_config.yaml\n", - "Available compute packages:\n", - "\n", - "default\n", - "slurm\n", - "singularity_slurm\n", - "singularity\n", - "local\n", - "docker\n" - ] - }, - { - "ename": "", - "evalue": "1", - "output_type": "error", - "traceback": [] - } - ], - "source": [ - "divvy list" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# The `write` command\n", - "\n", - "Use `divvy write` to actually write a new script using a template. To do this, you'll need to provide 3 things: a template (which comes from your compute package), a settings file with variables, and an outfile.\n", - "\n", - "\n", - "## The settings file\n", - "\n", - "The settings argument is where you can pass an existing `yaml` file with key-value pairs. 
Here's a simple example:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "time: 4-0-0\n", - "logfile: results.log\n", - "cores: 6\n", - "partition: large_mem\n", - "mem: 16G\n" - ] - } - ], - "source": [ - "cat settings.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's take a look at the template we are going to use by activating the `slurm` package" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='{JOBNAME}'\n", - "#SBATCH --output='{LOGFILE}'\n", - "#SBATCH --mem='{MEM}'\n", - "#SBATCH --cpus-per-task='{CORES}'\n", - "#SBATCH --time='{TIME}'\n", - "#SBATCH --partition='{PARTITION}'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "{CODE}\n" - ] - } - ], - "source": [ - "cat ../divvy/default_config/divvy_templates/slurm_template.sub" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We use `divvy` to populate that template with our list of variables above, like this:" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using default config. 
No config found in env var: ['DIVCFG', 'PEPENV']\n", - "Using divvy config: /home/nsheff/.local/lib/python2.7/site-packages/divvy/default_config/divvy_config.yaml\n", - "Activating compute package 'slurm'\n", - "Loading settings file: settings.yaml\n", - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test.sub\n" - ] - } - ], - "source": [ - "divvy write -p slurm -s settings.yaml -o test.sub" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we can take a look at what our sbumission scripts looks like." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='{JOBNAME}'\n", - "#SBATCH --output='results.log'\n", - "#SBATCH --mem='16G'\n", - "#SBATCH --cpus-per-task='6'\n", - "#SBATCH --time='4-0-0'\n", - "#SBATCH --partition='large_mem'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "{CODE}\n" - ] - } - ], - "source": [ - "cat test.sub" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We populated several variables, like `{LOGFILE}` and `{TIME}`, from the `settings.yaml` file. However, the `{CODE}` and `{JOBNAME}` variables are still unpopulated, so this submission script is incomplete. To remedy this, we'll use `divvy`'s command-line variable passing: any non-interpreted arguments passed to `divvy` are assumed to be variables to populate the template. These command-line variables are considered highest priority and so will override any values in the more distant locations. For example:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using default config. 
No config found in env var: ['DIVCFG', 'PEPENV']\n", - "Using divvy config: /home/nsheff/.local/lib/python2.7/site-packages/divvy/default_config/divvy_config.yaml\n", - "Activating compute package 'slurm'\n", - "Loading settings file: settings.yaml\n", - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test.sub\n" - ] - } - ], - "source": [ - "divvy write -p slurm -s settings.yaml -o test.sub -c code=run-this-cmd jobname=12345 time=6-0-0" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='12345'\n", - "#SBATCH --output='results.log'\n", - "#SBATCH --mem='16G'\n", - "#SBATCH --cpus-per-task='6'\n", - "#SBATCH --time='6-0-0'\n", - "#SBATCH --partition='large_mem'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "run-this-cmd\n" - ] - } - ], - "source": [ - "cat test.sub" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now we have a complete script, which we can run with `sbatch test.sub`. Notice also that the `time` variable uses the one provided on the CLI rather than the one provided in the `settings.yaml` file, because the CLI has a higher priority.\n", - "\n", - "Variables can come from these 3 sources, in order of increasing priority: 1) compute package (defined in the `divvy` configuration file and selected with the `-p` or `--package` argument); 2) `settings.yaml` file, passed with `-s` or `--settings`; 3) any additional variables passed on the command line as key-value pairs to `-c`." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Submitting jobs\n", - "\n", - "Let's try actually submitting these jobs with `divvy submit`:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using default config. No config found in env var: ['DIVCFG', 'PEPENV']\n", - "Using divvy config: /home/nsheff/.local/lib/python2.7/site-packages/divvy/default_config/divvy_config.yaml\n", - "Activating compute package 'slurm'\n", - "Loading settings file: settings.yaml\n", - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test.sub\n", - "sbatch test.sub\n", - "sh: 1: sbatch: not found\n" - ] - } - ], - "source": [ - "divvy submit -p slurm -s settings.yaml -o test.sub -c code=run-this-cmd jobname=12345 time=6-0-0" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The *slurm* package uses `sbatch` as its `submission_command`, but since I'm running this locally, it won't run as I have no `sbatch` command available. Let's try `local` instead:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Using default config. 
No config found in env var: ['DIVCFG', 'PEPENV']\n", - "Using divvy config: /home/nsheff/.local/lib/python2.7/site-packages/divvy/default_config/divvy_config.yaml\n", - "Activating compute package 'local'\n", - "Loading settings file: settings.yaml\n", - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test.sub\n", - "sh test.sub\n", - "Compute node: zither\n", - "Start time: 2020-05-19 07:46:03\n", - "build\n", - "cli.ipynb\n", - "debug.ipynb\n", - "results.log\n", - "settings.yaml\n", - "test_local.sub\n", - "test_script.sub\n", - "test.sub\n", - "tutorial.ipynb\n" - ] - } - ], - "source": [ - "divvy submit -p local -s settings.yaml -o test.sub -c code=ls" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "There I switched the command to `ls`, which shows you a result of everything on this computer." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Bash", - "language": "bash", - "name": "bash" - }, - "language_info": { - "codemirror_mode": "shell", - "file_extension": ".sh", - "mimetype": "text/x-sh", - "name": "bash" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/debug_divvy.ipynb b/docs_jupyter/debug_divvy.ipynb deleted file mode 100644 index 050581e69..000000000 --- a/docs_jupyter/debug_divvy.ipynb +++ /dev/null @@ -1,56 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "If you want to explore `divvy` with more output, you can turn on debug mode mode like this:" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": false - }, - "outputs": [ - { - "data": { - "text/plain": [ - "" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "import divvy\n", - "\n", - "divvy.setup_divvy_logger(\"DEBUG\", devmode=True)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 2", - "language": "python", - "name": "python2" - }, - "language_info": { 
- "codemirror_mode": { - "name": "ipython", - "version": 2 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/hello-world.ipynb b/docs_jupyter/hello-world.ipynb deleted file mode 100644 index e6119f62e..000000000 --- a/docs_jupyter/hello-world.ipynb +++ /dev/null @@ -1,524 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Hello World! example for looper\n", - "\n", - "This tutorial demonstrates how to install `looper` and use it to run a pipeline on a PEP project. \n", - "\n", - "## 1. Install the latest version of looper:\n", - "\n", - "```console\n", - "pip install --user --upgrade looper\n", - "```\n", - "\n", - "## 2. Download and unzip the hello_looper repository\n", - "\n", - "The [hello looper repository](http://github.com/pepkit/hello_looper) contains a basic functional example project (in `/project`) and a looper-compatible pipeline (in `/pipeline`) that can run on that project. Let's download and unzip it:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "--2023-11-08 17:27:01-- https://github.com/pepkit/hello_looper/archive/refs/heads/master.zip\n", - "Resolving github.com (github.com)... 140.82.114.3\n", - "Connecting to github.com (github.com)|140.82.114.3|:443... connected.\n", - "HTTP request sent, awaiting response... 302 Found\n", - "Location: https://codeload.github.com/pepkit/hello_looper/zip/refs/heads/master [following]\n", - "--2023-11-08 17:27:01-- https://codeload.github.com/pepkit/hello_looper/zip/refs/heads/master\n", - "Resolving codeload.github.com (codeload.github.com)... 140.82.113.10\n", - "Connecting to codeload.github.com (codeload.github.com)|140.82.113.10|:443... 
connected.\n", - "HTTP request sent, awaiting response... 200 OK\n", - "Length: unspecified [application/zip]\n", - "Saving to: ‘master.zip’\n", - "\n", - "master.zip [ <=> ] 13.37K --.-KB/s in 0.03s \n", - "\n", - "2023-11-08 17:27:01 (472 KB/s) - ‘master.zip’ saved [13693]\n", - "\n" - ] - } - ], - "source": [ - "!wget https://github.com/pepkit/hello_looper/archive/refs/heads/master.zip" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Archive: master.zip\r\n", - "73ef08e38d3e17fd3d4f940282c80e3ee4dbb91f\r\n", - " creating: hello_looper-master/\r\n", - " inflating: hello_looper-master/.gitignore \r\n", - " inflating: hello_looper-master/.looper.yaml \r\n", - " inflating: hello_looper-master/.looper_pephub.yaml \r\n", - " inflating: hello_looper-master/.looper_pipestat.yaml \r\n", - " inflating: hello_looper-master/.looper_pipestat_shell.yaml \r\n", - " inflating: hello_looper-master/README.md \r\n", - " creating: hello_looper-master/data/\r\n", - " inflating: hello_looper-master/data/frog1_data.txt \r\n", - " inflating: hello_looper-master/data/frog2_data.txt \r\n", - " inflating: hello_looper-master/looper_pipelines.md \r\n", - " creating: hello_looper-master/old_specification/\r\n", - " inflating: hello_looper-master/old_specification/README.md \r\n", - " creating: hello_looper-master/old_specification/data/\r\n", - " inflating: hello_looper-master/old_specification/data/frog1_data.txt \r\n", - " inflating: hello_looper-master/old_specification/data/frog2_data.txt \r\n", - " creating: hello_looper-master/old_specification/pipeline/\r\n", - " inflating: hello_looper-master/old_specification/pipeline/count_lines.sh \r\n", - " inflating: hello_looper-master/old_specification/pipeline/pipeline_interface.yaml \r\n", - " creating: hello_looper-master/old_specification/project/\r\n", - " inflating: 
hello_looper-master/old_specification/project/project_config.yaml \r\n", - " inflating: hello_looper-master/old_specification/project/sample_annotation.csv \r\n", - " creating: hello_looper-master/pipeline/\r\n", - " inflating: hello_looper-master/pipeline/count_lines.sh \r\n", - " inflating: hello_looper-master/pipeline/pipeline_interface.yaml \r\n", - " inflating: hello_looper-master/pipeline/pipeline_interface_project.yaml \r\n", - " creating: hello_looper-master/pipeline_pipestat/\r\n", - " inflating: hello_looper-master/pipeline_pipestat/count_lines.py \r\n", - " inflating: hello_looper-master/pipeline_pipestat/count_lines_pipestat.sh \r\n", - " inflating: hello_looper-master/pipeline_pipestat/pipeline_interface.yaml \r\n", - " inflating: hello_looper-master/pipeline_pipestat/pipeline_interface_shell.yaml \r\n", - " inflating: hello_looper-master/pipeline_pipestat/pipestat_output_schema.yaml \r\n", - " creating: hello_looper-master/project/\r\n", - " inflating: hello_looper-master/project/project_config.yaml \r\n", - " inflating: hello_looper-master/project/sample_annotation.csv \r\n" - ] - } - ], - "source": [ - "!unzip master.zip" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Run it\n", - "\n", - "Run it by changing to the directory and then invoking `looper run` on the project configuration file." - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looper version: 1.5.2-dev\r\n", - "Command: run\r\n", - "Using default divvy config. 
You may specify in env var: ['DIVCFG']\r\n", - "Pipestat compatible: False\r\n", - "\u001b[36m## [1 of 2] sample: frog_1; pipeline: count_lines\u001b[0m\r\n", - "/home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/pipeline/count_lines.sh data/frog1_data.txt\r\n", - "Writing script to /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_1.sub\r\n", - "Job script (n=1; 0.00Gb): /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_1.sub\r\n", - "Compute node: databio\r\n", - "Start time: 2023-11-08 17:29:45\r\n", - "wc: data/frog1_data.txt: No such file or directory\r\n", - "Number of lines: \r\n", - "\u001b[36m## [2 of 2] sample: frog_2; pipeline: count_lines\u001b[0m\r\n", - "/home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/pipeline/count_lines.sh data/frog2_data.txt\r\n", - "Writing script to /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_2.sub\r\n", - "Job script (n=1; 0.00Gb): /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_2.sub\r\n", - "Compute node: databio\r\n", - "Start time: 2023-11-08 17:29:45\r\n", - "wc: data/frog2_data.txt: No such file or directory\r\n", - "Number of lines: \r\n", - "\r\n", - "Looper finished\r\n", - "Samples valid for job generation: 2 of 2\r\n", - "Commands submitted: 2 of 2\r\n", - "Jobs submitted: 2\r\n", - "{'Pipestat compatible': False, 'Commands submitted': '2 of 2', 'Jobs submitted': 2}\r\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!looper run --looper-config hello_looper-master/.looper.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Voila! You've run your very first pipeline across multiple samples using `looper`!" 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Exploring the results\n", - "\n", - "Now, let's inspect the `hello_looper` repository you downloaded. It has 3 components, each in a subfolder:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "hello_looper-master/data/\r\n", - "├── frog1_data.txt\r\n", - "└── frog2_data.txt\r\n", - "hello_looper-master/pipeline/\r\n", - "├── count_lines.sh\r\n", - "└── pipeline_interface.yaml\r\n", - "hello_looper-master/project/\r\n", - "├── project_config.yaml\r\n", - "└── sample_annotation.csv\r\n", - "\r\n", - "0 directories, 6 files\r\n" - ] - } - ], - "source": [ - "!tree hello_looper-master/*/" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "These are:\n", - "\n", - " * `/data` -- contains 2 data files for 2 samples. These input files were each passed to the pipeline.\n", - " * `/pipeline` -- contains the script we want to run on each sample in our project. Our pipeline is a very simple shell script named `count_lines.sh`, which (duh!) counts the number of lines in an input file.\n", - " * `/project` -- contains 2 files that describe metadata for the project (`project_config.yaml`) and the samples (`sample_annotation.csv`). This particular project describes just two samples listed in the annotation file. These files together make up a [PEP](http://pep.databio.org)-formatted project, and can therefore be read by any PEP-compatible tool, including `looper`.\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "When we invoke `looper` from the command line we told it to `run project/project_config.yaml`. 
`looper` reads the [project/project_config.yaml](https://github.com/pepkit/hello_looper/blob/master/project/project_config.yaml) file, which points to a few things:\n", - "\n", - " * the [project/sample_annotation.csv](https://github.com/pepkit/hello_looper/blob/master/project/sample_annotation.csv) file, which specifies a few samples, their type, and path to data file\n", - " * the `output_dir`, which is where looper results are saved. Results will be saved in `$HOME/hello_looper_results`.\n", - " * the `pipeline_interface.yaml` file, ([pipeline/pipeline_interface.yaml](https://github.com/pepkit/hello_looper/blob/master/pipeline/pipeline_interface.yaml)), which tells looper how to connect to the pipeline ([pipeline/count_lines.sh](https://github.com/pepkit/hello_looper/blob/master/pipeline/)).\n", - "\n", - "The 3 folders (`data`, `project`, and `pipeline`) are modular; there is no need for these to live in any predetermined folder structure. For this example, the data and pipeline are included locally, but in practice, they are usually in a separate folder; you can point to anything (so data, pipelines, and projects may reside in distinct spaces on disk). You may also include more than one pipeline interface in your `project_config.yaml`, so in a looper project, many-to-many relationships are possible." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Looper config\n", - "\n", - "The [looper config](looper-config.md) contains paths to the project config, the output_dir as well as any dfine pipeline interfaces. 
" - ] - }, - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pep_config: project/project_config.yaml # local path to pep config\r\n", - "# pep_config: pepkit/hello_looper:default # you can also use a pephub registry path\r\n", - "output_dir: \"results\"\r\n", - "pipeline_interfaces:\r\n", - " sample: pipeline/pipeline_interface.yaml\r\n" - ] - } - ], - "source": [ - "!cat hello_looper-master/.looper.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "\n", - "## Project Config\n", - "\n", - "The project config file contains the PEP version and sample annotation sheet. (see [defining a project](defining-a-project.md)).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pep_version: 2.0.0\r\n", - "sample_table: sample_annotation.csv\r\n" - ] - } - ], - "source": [ - "!cat hello_looper-master/project/project_config.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Pipeline Interface\n", - "\n", - "The [pipeline interface](pipeline-interface-specification.md) shows the pipeline_name, pipeline_type, as well as the var_templates and command_templates used for this pipeline.\n" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pipeline_name: count_lines\r\n", - "pipeline_type: sample\r\n", - "var_templates:\r\n", - " pipeline: '{looper.piface_dir}/count_lines.sh'\r\n", - "command_template: >\r\n", - " {pipeline.var_templates.pipeline} {sample.file}\r\n" - ] - } - ], - "source": [ - "!cat hello_looper-master/pipeline/pipeline_interface.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Alright, next let's explore what this pipeline stuck into our 
`output_dir`:\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "/home/nsheff/hello_looper_results\r\n", - "├── results_pipeline\r\n", - "└── submission\r\n", - " ├── count_lines.sh_frog_1.log\r\n", - " ├── count_lines.sh_frog_1.sub\r\n", - " ├── count_lines.sh_frog_2.log\r\n", - " ├── count_lines.sh_frog_2.sub\r\n", - " ├── frog_1.yaml\r\n", - " └── frog_2.yaml\r\n", - "\r\n", - "2 directories, 6 files\r\n" - ] - } - ], - "source": [ - "!tree $HOME/hello_looper_results" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "Inside of an `output_dir` there will be two directories:\n", - "\n", - "- `results_pipeline` - a directory with output of the pipeline(s), for each sample/pipeline combination (often one per sample)\n", - "- `submissions` - which holds a YAML representation of each sample and a log file for each submitted job\n", - "\n", - "From here to running hundreds of samples of various sample types is virtually the same effort!\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Running PEPs from PEPHub\n", - "\n", - "Looper also supports running a PEP from [PEPHub](https://pephub.databio.org/)!" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pep_config: pepkit/hello_looper:default # pephub registry path or local path\r\n", - "output_dir: results\r\n", - "pipeline_interfaces:\r\n", - " sample: pipeline/pipeline_interface.yaml\r\n" - ] - } - ], - "source": [ - "!cat hello_looper-master/.looper_pephub.yaml" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Looper version: 1.5.2-dev\n", - "Command: run\n", - "Using default divvy config. 
You may specify in env var: ['DIVCFG']\n", - "No config key in Project, or reading project from dict\n", - "Processing project from dictionary...\n", - "Pipestat compatible: False\n", - "\u001b[36m## [1 of 2] sample: frog_1; pipeline: count_lines\u001b[0m\n", - "/home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/pipeline/count_lines.sh data/frog1_data.txt\n", - "Writing script to /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_1.sub\n", - "Job script (n=1; 0.00Gb): /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_1.sub\n", - "Compute node: databio\n", - "Start time: 2023-11-09 15:39:28\n", - "wc: data/frog1_data.txt: No such file or directory\n", - "Number of lines: \n", - "\u001b[36m## [2 of 2] sample: frog_2; pipeline: count_lines\u001b[0m\n", - "/home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/pipeline/count_lines.sh data/frog2_data.txt\n", - "Writing script to /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_2.sub\n", - "Job script (n=1; 0.00Gb): /home/drc/GITHUB/looper/master/looper/docs_jupyter/hello_looper-master/results/submission/count_lines_frog_2.sub\n", - "Compute node: databio\n", - "Start time: 2023-11-09 15:39:28\n", - "wc: data/frog2_data.txt: No such file or directory\n", - "Number of lines: \n", - "\n", - "Looper finished\n", - "Samples valid for job generation: 2 of 2\n", - "Commands submitted: 2 of 2\n", - "Jobs submitted: 2\n", - "{'Pipestat compatible': False, 'Commands submitted': '2 of 2', 'Jobs submitted': 2}\n", - "\u001b[0m" - ] - } - ], - "source": [ - "!looper run --looper-config hello_looper-master/.looper_pephub.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Pipestat compatible configurations\n", - "\n", - "Looper can also be used in tandem with 
[pipestat](https://pipestat.databio.org/en/latest/) to report pipeline results." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "pep_config: ./project/project_config.yaml # pephub registry path or local path\r\n", - "output_dir: ./results\r\n", - "pipeline_interfaces:\r\n", - " sample: ./pipeline_pipestat/pipeline_interface.yaml\r\n", - "pipestat:\r\n", - " results_file_path: results.yaml" - ] - } - ], - "source": [ - "!cat hello_looper-master/.looper_pipestat.yaml" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "\n", - "## A few more basic looper options\n", - "\n", - "Looper also provides a few other simple arguments that let you adjust what it does. You can find a [complete reference of usage](usage.md) in the docs. Here are a few of the more common options:\n", - "\n", - "For `looper run`:\n", - "\n", - "- `-d`: Dry run mode (creates submission scripts, but does not execute them) \n", - "- `--limit`: Only run a few samples \n", - "- `--lumpn`: Run several commands together as a single job. This is useful when you have a quick pipeline to run on many samples and want to group them.\n", - "\n", - "There are also other commands:\n", - "\n", - "- `looper check`: checks on the status (running, failed, completed) of your jobs\n", - "- `looper summarize`: produces an output file that summarizes your project results\n", - "- `looper destroy`: completely erases all results so you can restart\n", - "- `looper rerun`: rerun only jobs that have failed.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## On your own\n", - "\n", - "To use `looper` on your own, you will need to prepare 2 things: a **project** (metadata that define *what* you want to process), and **pipelines** (*how* to process data). To link your project to `looper`, you will need to [define a project](defining-a-project.md). 
You will want to either use pre-made `looper`-compatible pipelines or link your own custom-built pipelines. These docs will also show you how to connect your pipeline to your project.\n" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.12" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/docs_jupyter/tutorial_divvy.ipynb b/docs_jupyter/tutorial_divvy.ipynb deleted file mode 100644 index a9a3c044d..000000000 --- a/docs_jupyter/tutorial_divvy.ipynb +++ /dev/null @@ -1,413 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# python tutorial\n", - "\n", - "## Compute packages\n", - "\n", - "When you start `divvy`, you may provide a configuration file that specifies one or more *compute packages*. A compute package is just a set of a variables that contains information needed to run a job, such as a job submission template, the command that you use to submit a job (*e.g.* `sbatch` or `qsub`), and any other variables needed to fill the template (*e.g.* `partition` or `account`). You can find out [how to write your own divvy config file](../configuration), but for this tutorial, we'll just use the default.\n", - "\n", - "Start by importing `divvy`, and then create a new `ComputingConfiguration` object. 
If you provide no arguments, you'll just get a few default packages:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "scrolled": true - }, - "outputs": [], - "source": [ - "import divvy\n", - "\n", - "dcc = divvy.ComputingConfiguration()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This loads up the default compute package, and we see that there are a few other packages available. We can explore the compute settings in the loaded (`default`) package like this: " - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "submission_template: /home/nsheff/.local/lib/python3.5/site-packages/divvy/default_config/submit_templates/localhost_template.sub\n", - "submission_command: sh" - ] - }, - "execution_count": 2, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dcc.compute" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here you can see that a *compute package* is really a simple thing. In this case, it's just 2 key-value pairs. 
The `submission_template` key is a path to a template file, with these contents: \n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "{CODE} | tee {LOGFILE}\n", - "\n" - ] - } - ], - "source": [ - "with open(dcc.compute.submission_template) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can populate this simple template by passing values for the `{VARIABLE}` text in the template:" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test_local.sub\n" - ] - }, - { - "data": { - "text/plain": [ - "'test_local.sub'" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dcc.write_script(\n", - " \"test_local.sub\", {\"code\": \"run-this-command\", \"logfile\": \"logfile.txt\"}\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now let's look at the contents of our populated template:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "run-this-command | tee logfile.txt\n", - "\n" - ] - } - ], - "source": [ - "with open(\"test_local.sub\") as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This function opens the template specified by the `submission_template` variable in the compute package, and then populates any template variables with values 
from the compute package. The original `{CODE}` and `{LOGFILE}` has been replaced by the variables we passed to `write_script()`.\n", - "\n", - "The other variable in the compute package is `submission_command`, which contains the shell instruction that would be used to submit this populated template; in this case, it's simply `sh` to run this script in the console. We can activate a different *compute_package* like this: " - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Activating compute package 'slurm'\n" - ] - }, - { - "data": { - "text/plain": [ - "True" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dcc.activate_package(\"slurm\")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "It returns 'True' to indicate that the activation has been successful. This will change our settings. Let's inspect the new package:" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": { - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/plain": [ - "submission_template: /home/nsheff/.local/lib/python3.5/site-packages/divvy/default_config/submit_templates/slurm_template.sub\n", - "submission_command: sbatch" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "dcc.compute" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Now that we've activated the package of interest, let's take a peek at the now-active `submission_template`:" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='{JOBNAME}'\n", - "#SBATCH --output='{LOGFILE}'\n", - "#SBATCH --mem='{MEM}'\n", - "#SBATCH --cpus-per-task='{CORES}'\n", - "#SBATCH 
--time='{TIME}'\n", - "#SBATCH --partition='{PARTITION}'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "{CODE}\n", - "\n" - ] - } - ], - "source": [ - "with open(dcc.compute.submission_template) as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this template there are a lot more variables to populate. If we don't populate them all, they will just be left in the template. Let's pass a value for the `code` variable and see how this changes the submission script output:" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test_script.sub\n" - ] - } - ], - "source": [ - "s = dcc.write_script(\"test_script.sub\", {\"code\": \"yellow\"})" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here's the output. 
Notice that the `{CODE}` variable has been replaced with the word `yellow`:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='{JOBNAME}'\n", - "#SBATCH --output='{LOGFILE}'\n", - "#SBATCH --mem='{MEM}'\n", - "#SBATCH --cpus-per-task='{CORES}'\n", - "#SBATCH --time='{TIME}'\n", - "#SBATCH --partition='{PARTITION}'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "yellow\n", - "\n" - ] - } - ], - "source": [ - "with open(\"test_script.sub\") as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Using a priority list of variables\n", - "\n", - "Now, you can also pass more than one `Dict` object, in priority order, by just passing a list. Here, we'll pass 2 dicts, and any values in the 1st will override values in the 2nd:" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "Writing script to /home/nsheff/code/divvy/docs_jupyter/test_script.sub\n" - ] - } - ], - "source": [ - "s = dcc.write_script(\n", - " \"test_script.sub\", [{\"code\": \"red\"}, {\"code\": \"yellow\", \"time\": \"now\"}]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#!/bin/bash\n", - "#SBATCH --job-name='{JOBNAME}'\n", - "#SBATCH --output='{LOGFILE}'\n", - "#SBATCH --mem='{MEM}'\n", - "#SBATCH --cpus-per-task='{CORES}'\n", - "#SBATCH --time='now'\n", - "#SBATCH --partition='{PARTITION}'\n", - "#SBATCH -m block\n", - "#SBATCH --ntasks=1\n", - "\n", - "echo 'Compute node:' `hostname`\n", - "echo 'Start time:' `date +'%Y-%m-%d %T'`\n", - "\n", - "red\n", - "\n" - ] - 
} - ], - "source": [ - "with open(\"test_script.sub\") as f:\n", - " print(f.read())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "In this case the value `red` took priority for the `code` variable, because it came first; but `time` was not overwritten in the first entry, so it is maintained. This allows for a cascading cumulative priority variable replacement." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.7.5" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/looper/__init__.py b/looper/__init__.py index fe751d02d..015da2c00 100644 --- a/looper/__init__.py +++ b/looper/__init__.py @@ -7,33 +7,45 @@ """ +from importlib.metadata import version + import logmuse logmuse.init_logger("looper") +__version__ = version("looper") + +# Lazy imports - only loaded when accessed +_lazy_imports = { + "DEFAULT_COMPUTE_RESOURCES_NAME": ".divvy", + "ComputingConfiguration": ".divvy", + "select_divvy_config": ".divvy", + "COMPUTE_KEY": ".divvy", # NEW_COMPUTE_KEY + "SubmissionConductor": ".conductor", + "write_submission_yaml": ".conductor", + "PipelineInterface": ".pipeline_interface", + "write_custom_template": ".plugins", + "write_sample_yaml": ".plugins", + "write_sample_yaml_cwl": ".plugins", + "write_sample_yaml_prj": ".plugins", + "Project": ".project", +} + + +def __getattr__(name): + if name in _lazy_imports: + module_path = _lazy_imports[name] + import importlib + + module = importlib.import_module(module_path, __package__) + value = getattr(module, name if name != "COMPUTE_KEY" else "NEW_COMPUTE_KEY") + globals()[name] = value # Cache for subsequent access + return value + raise AttributeError(f"module 
{__name__!r} has no attribute {name!r}") + + +def __dir__(): + return list(_lazy_imports.keys()) + ["__version__"] -from .divvy import ComputingConfiguration, select_divvy_config -from .divvy import DEFAULT_COMPUTE_RESOURCES_NAME -from .divvy import NEW_COMPUTE_KEY as COMPUTE_KEY - -from ._version import __version__ -from .conductor import ( - SubmissionConductor, - write_submission_yaml, -) -from .plugins import ( - write_sample_yaml, - write_sample_yaml_cwl, - write_sample_yaml_prj, - write_custom_template, -) -from .const import * -from .pipeline_interface import PipelineInterface -from .project import Project - -# Not used here, but make this the main import interface between peppy and -# looper, so that other modules within this package need not worry about -# the locations of some of the peppy declarations. Effectively, concentrate -# the connection between peppy and looper here, to the extent possible. __all__ = [ "Project", diff --git a/looper/__main__.py b/looper/__main__.py index 3e9816554..ff3d95f85 100644 --- a/looper/__main__.py +++ b/looper/__main__.py @@ -1,7 +1,6 @@ import sys from .cli_pydantic import main -from .cli_divvy import main as divvy_main if __name__ == "__main__": try: diff --git a/looper/_version.py b/looper/_version.py deleted file mode 100644 index c9ded3fc2..000000000 --- a/looper/_version.py +++ /dev/null @@ -1,2 +0,0 @@ -__version__ = "2.0.3" -# You must change the version in parser = pydantic_argparse.ArgumentParser in cli_pydantic.py!!! diff --git a/looper/api/README.md b/looper/api/README.md new file mode 100644 index 000000000..e0cc9359f --- /dev/null +++ b/looper/api/README.md @@ -0,0 +1,34 @@ +# Looper HTTP API + +## Overview + +This API provides an HTTP interface for running the `looper` commands, allowing users to interact with Looper via HTTP requests. 
+ +## Usage +### Running the server +Run the app: +```bash +looper-serve [--host ] [--port ] +``` + +> [!NOTE] +This assumes that all files specified in the arguments are available on the file system of the machine that is running the HTTP API server. Best make sure you use absolute file paths in all `looper` YAML configuration files. + +### Sending requests +To test this, you can clone the [`hello_looper`](https://github.com/pepkit/hello_looper) repository and then run (for example) the following in a second terminal: +```bash +curl -X POST -H "Content-Type: application/json" -d '{"run": {"time_delay": 5}, "looper_config": "/path/to/hello_looper/.looper.yaml"}' "http://127.0.0.1:8000" +``` +This will return a six-letter job ID, say `abc123`. Then get the result / output of the run with +```bash +curl -X GET -v localhost:8000/status/abc123 +``` +For better visualization / readability, you can post-process the output by piping it to `jq` (` | jq -r .console_output`). + +## API Documentation +The API documentation is automatically generated and can be accessed in your web browser: + +Swagger UI: http://127.0.0.1:8000/docs +ReDoc: http://127.0.0.1:8000/redoc + +Explore the API documentation to understand available endpoints, request parameters, and response formats. 
diff --git a/tests/smoketests/__init__.py b/looper/api/__init__.py similarity index 100% rename from tests/smoketests/__init__.py rename to looper/api/__init__.py diff --git a/looper/api/main.py b/looper/api/main.py new file mode 100644 index 000000000..36f56473c --- /dev/null +++ b/looper/api/main.py @@ -0,0 +1,144 @@ +import secrets +from argparse import ArgumentParser, Namespace +from typing import Dict + +import fastapi +import pydantic +import uvicorn +from fastapi import FastAPI, HTTPException + +from looper.api import stdout_redirects +from looper.cli_pydantic import run_looper +from looper.command_models.commands import SUPPORTED_COMMANDS, TopLevelParser + +stdout_redirects.enable_proxy() + + +class Job(pydantic.BaseModel): + id: str = pydantic.Field( + default_factory=lambda: secrets.token_urlsafe(4), + description="The unique identifier of the job", + ) + status: str = pydantic.Field( + default="in_progress", + description="The current status of the job. Can be `in_progress`, `completed`, or `failed`.", + ) + console_output: str | None = pydantic.Field( + default=None, + description="Console output produced by `looper` while performing the requested action", + ) + error: str | None = pydantic.Field( + default=None, + description="Error message if the job failed", + ) + + +app = FastAPI(validate_model=True) +jobs: Dict[str, Job] = {} + + +def background_async(top_level_model: TopLevelParser, job_id: str) -> None: + argparse_namespace = create_argparse_namespace(top_level_model) + output_stream = stdout_redirects.redirect() + + try: + run_looper(argparse_namespace) + jobs[job_id].status = "completed" + except Exception as e: + jobs[job_id].status = "failed" + jobs[job_id].error = str(e) + finally: + # Here, we should call `stdout_redirects.stop_redirect()`, but that fails for reasons discussed + # in the following issue: https://github.com/python/cpython/issues/80374 + # But this *seems* not to pose any problems. 
+ jobs[job_id].console_output = output_stream.getvalue() + + +def create_argparse_namespace(top_level_model: TopLevelParser) -> Namespace: + """ + Converts a TopLevelParser instance into an argparse.Namespace object. + + This function takes a TopLevelParser instance, and converts it into an + argparse.Namespace object compatible with run_looper(). + + :param TopLevelParser top_level_model: An instance of the TopLevelParser model + :return argparse.Namespace: An argparse.Namespace object representing + the parsed command-line arguments. + """ + namespace = Namespace() + + # Find which command was specified and set it + command_name = None + for cmd in SUPPORTED_COMMANDS: + cmd_value = getattr(top_level_model, cmd.name, None) + if cmd_value is not None: + command_name = cmd.name + # Add all command arguments to the namespace + for argname, value in vars(cmd_value).items(): + setattr(namespace, argname, value) + break + + namespace.command = command_name + + # Add top-level arguments + namespace.silent = top_level_model.silent + namespace.verbosity = top_level_model.verbosity + namespace.logdev = top_level_model.logdev + + return namespace + + +@app.post( + "/", + status_code=202, + summary="Run Looper", + description="Start a `looper` command with arguments specified in " + "`top_level_model` in the background and return a job identifier.", +) +async def main_endpoint( + top_level_model: TopLevelParser, background_tasks: fastapi.BackgroundTasks +) -> Dict: + job = Job() + jobs[job.id] = job + background_tasks.add_task(background_async, top_level_model, job.id) + return {"job_id": job.id} + + +@app.get( + "/status/{job_id}", + summary="Get job status", + description="Retrieve the status of a job based on its unique identifier.", +) +async def get_status(job_id: str) -> Job: + if job_id not in jobs: + raise HTTPException(status_code=404, detail=f"Job '{job_id}' not found") + return jobs[job_id] + + +@app.get( + "/jobs", + summary="List all jobs", + 
description="Retrieve a list of all submitted jobs with their IDs and statuses.", +) +async def list_jobs() -> Dict: + return {"jobs": [{"id": job.id, "status": job.status} for job in jobs.values()]} + + +def main() -> None: + parser = ArgumentParser("looper-serve", description="Run looper HTTP API server") + parser.add_argument( + "--host", + type=str, + default="0.0.0.0", + help="Host IP address to use (127.0.0.1 for local access only)", + ) + parser.add_argument( + "--port", type=int, default=8000, help="Port the server listens on" + ) + args = parser.parse_args() + + uvicorn.run(app, host=args.host, port=args.port) + + +if __name__ == "__main__": + main() diff --git a/looper/api/stdout_redirects.py b/looper/api/stdout_redirects.py new file mode 100644 index 000000000..924c9c5a6 --- /dev/null +++ b/looper/api/stdout_redirects.py @@ -0,0 +1,261 @@ +# ruff: noqa: E731 +# Copied from https://gitlab.com/yquemener/stdout-redirects +# +# copied from https://stackoverflow.com/a/43667367/1193986 +# +# (c) umichscoots 2017 +# License unsepcified. Assumed to be CC-by-sa as is StackOverflow's policy +# +# The class LocalProxy is taken from the werkzeug project +# https://raw.githubusercontent.com/pallets/werkzeug/ef545f0d0bf28cbad02066b4cb7471bea50a93ee/src/werkzeug/local.py +# It is licensed under the BSD-3-Clause License +# +# I guess that means the result is CC-by-SA + + +import copy +import sys +import threading +from io import StringIO +from typing import Any, Optional, Union + +# Save all of the objects for use later. +orig___stdout__ = sys.__stdout__ +orig___stderr__ = sys.__stderr__ +orig_stdout = sys.stdout +orig_stderr = sys.stderr +thread_proxies = {} + + +class LocalProxy: + """Acts as a proxy for a werkzeug local. Forwards all operations to + a proxied object. The only operations not supported for forwarding + are right handed operands and any kind of assignment. 
+ Example usage:: + from werkzeug.local import Local + l = Local() + # these are proxies + request = l('request') + user = l('user') + from werkzeug.local import LocalStack + _response_local = LocalStack() + # this is a proxy + response = _response_local() + Whenever something is bound to l.user / l.request the proxy objects + will forward all operations. If no object is bound a :exc:`RuntimeError` + will be raised. + To create proxies to :class:`Local` or :class:`LocalStack` objects, + call the object as shown above. If you want to have a proxy to an + object looked up by a function, you can (as of Werkzeug 0.6.1) pass + a function to the :class:`LocalProxy` constructor:: + session = LocalProxy(lambda: get_current_request().session) + .. versionchanged:: 0.6.1 + The class can be instantiated with a callable as well now. + """ + + __slots__ = ("__local", "__dict__", "__name__", "__wrapped__") + + def __init__( + self, + local: Union[Any, "LocalProxy"], + name: Optional[str] = None, + ) -> None: + object.__setattr__(self, "_LocalProxy__local", local) + object.__setattr__(self, "__name__", name) + if callable(local) and not hasattr(local, "__release_local__"): + # "local" is a callable that is not an instance of Local or + # LocalManager: mark it as a wrapped function. + object.__setattr__(self, "__wrapped__", local) + + def _get_current_object( + self, + ) -> object: + """Return the current object. This is useful if you want the real + object behind the proxy at a time for performance reasons or because + you want to pass the object into a different context. 
+ """ + if not hasattr(self.__local, "__release_local__"): + return self.__local() + try: + return getattr(self.__local, self.__name__) + except AttributeError: + raise RuntimeError(f"no object bound to {self.__name__}") + + @property + def __dict__(self): + try: + return self._get_current_object().__dict__ + except RuntimeError: + raise AttributeError("__dict__") + + def __repr__(self) -> str: + try: + obj = self._get_current_object() + except RuntimeError: + return f"<{type(self).__name__} unbound>" + return repr(obj) + + def __bool__(self) -> bool: + try: + return bool(self._get_current_object()) + except RuntimeError: + return False + + def __dir__(self): + try: + return dir(self._get_current_object()) + except RuntimeError: + return [] + + def __getattr__(self, name: str) -> Any: + if name == "__members__": + return dir(self._get_current_object()) + return getattr(self._get_current_object(), name) + + def __setitem__(self, key: Any, value: Any) -> None: + self._get_current_object()[key] = value # type: ignore + + def __delitem__(self, key): + del self._get_current_object()[key] + + __setattr__ = lambda x, n, v: setattr(x._get_current_object(), n, v) # type: ignore + __delattr__ = lambda x, n: delattr(x._get_current_object(), n) # type: ignore + __str__ = lambda x: str(x._get_current_object()) # type: ignore + __lt__ = lambda x, o: x._get_current_object() < o + __le__ = lambda x, o: x._get_current_object() <= o + __eq__ = lambda x, o: x._get_current_object() == o # type: ignore + __ne__ = lambda x, o: x._get_current_object() != o # type: ignore + __gt__ = lambda x, o: x._get_current_object() > o + __ge__ = lambda x, o: x._get_current_object() >= o + __hash__ = lambda x: hash(x._get_current_object()) # type: ignore + __call__ = lambda x, *a, **kw: x._get_current_object()(*a, **kw) + __len__ = lambda x: len(x._get_current_object()) + __getitem__ = lambda x, i: x._get_current_object()[i] + __iter__ = lambda x: iter(x._get_current_object()) + __contains__ = lambda 
x, i: i in x._get_current_object() + __add__ = lambda x, o: x._get_current_object() + o + __sub__ = lambda x, o: x._get_current_object() - o + __mul__ = lambda x, o: x._get_current_object() * o + __floordiv__ = lambda x, o: x._get_current_object() // o + __mod__ = lambda x, o: x._get_current_object() % o + __divmod__ = lambda x, o: x._get_current_object().__divmod__(o) + __pow__ = lambda x, o: x._get_current_object() ** o + __lshift__ = lambda x, o: x._get_current_object() << o + __rshift__ = lambda x, o: x._get_current_object() >> o + __and__ = lambda x, o: x._get_current_object() & o + __xor__ = lambda x, o: x._get_current_object() ^ o + __or__ = lambda x, o: x._get_current_object() | o + __div__ = lambda x, o: x._get_current_object().__div__(o) + __truediv__ = lambda x, o: x._get_current_object().__truediv__(o) + __neg__ = lambda x: -(x._get_current_object()) + __pos__ = lambda x: +(x._get_current_object()) + __abs__ = lambda x: abs(x._get_current_object()) + __invert__ = lambda x: ~(x._get_current_object()) + __complex__ = lambda x: complex(x._get_current_object()) + __int__ = lambda x: int(x._get_current_object()) + __long__ = lambda x: long(x._get_current_object()) # type: ignore # noqa + __float__ = lambda x: float(x._get_current_object()) + __oct__ = lambda x: oct(x._get_current_object()) + __hex__ = lambda x: hex(x._get_current_object()) + __index__ = lambda x: x._get_current_object().__index__() + __coerce__ = lambda x, o: x._get_current_object().__coerce__(x, o) + __enter__ = lambda x: x._get_current_object().__enter__() + __exit__ = lambda x, *a, **kw: x._get_current_object().__exit__(*a, **kw) + __radd__ = lambda x, o: o + x._get_current_object() + __rsub__ = lambda x, o: o - x._get_current_object() + __rmul__ = lambda x, o: o * x._get_current_object() + __rdiv__ = lambda x, o: o / x._get_current_object() + __rtruediv__ = __rdiv__ + __rfloordiv__ = lambda x, o: o // x._get_current_object() + __rmod__ = lambda x, o: o % x._get_current_object() + 
__rdivmod__ = lambda x, o: x._get_current_object().__rdivmod__(o) + __copy__ = lambda x: copy.copy(x._get_current_object()) + __deepcopy__ = lambda x, memo: copy.deepcopy(x._get_current_object(), memo) + + +def redirect(): + """ + Enables the redirect for the current thread's output to a single cStringIO + object and returns the object. + + :return: The StringIO object. + :rtype: ``cStringIO.StringIO`` + """ + # Get the current thread's identity. + ident = threading.currentThread().ident + + # Enable the redirect and return the cStringIO object. + thread_proxies[ident] = StringIO() + return thread_proxies[ident] + + +def stop_redirect(): + """ + Enables the redirect for the current thread's output to a single cStringIO + object and returns the object. + + :return: The final string value. + :rtype: ``str`` + """ + # Get the current thread's identity. + ident = threading.currentThread().ident + + # Only act on proxied threads. + if ident not in thread_proxies: + return + + # Read the value, close/remove the buffer, and return the value. + retval = thread_proxies[ident].getvalue() + thread_proxies[ident].close() + del thread_proxies[ident] + return retval + + +def _get_stream(original): + """ + Returns the inner function for use in the LocalProxy object. + + :param original: The stream to be returned if thread is not proxied. + :type original: ``file`` + :return: The inner function for use in the LocalProxy object. + :rtype: ``function`` + """ + + def proxy(): + """ + Returns the original stream if the current thread is not proxied, + otherwise we return the proxied item. + + :return: The stream object for the current thread. + :rtype: ``file`` + """ + # Get the current thread's identity. + ident = threading.currentThread().ident + + # Return the proxy, otherwise return the original. + return thread_proxies.get(ident, original) + + # Return the inner function. 
+ return proxy + + +def enable_proxy(): + """ + Overwrites __stdout__, __stderr__, stdout, and stderr with the proxied + objects. + """ + sys.__stdout__ = LocalProxy(_get_stream(sys.__stdout__)) + sys.__stderr__ = LocalProxy(_get_stream(sys.__stderr__)) + sys.stdout = LocalProxy(_get_stream(sys.stdout)) + sys.stderr = LocalProxy(_get_stream(sys.stderr)) + + +def disable_proxy(): + """ + Overwrites __stdout__, __stderr__, stdout, and stderr with the original + objects. + """ + sys.__stdout__ = orig___stdout__ + sys.__stderr__ = orig___stderr__ + sys.stdout = orig_stdout + sys.stderr = orig_stderr diff --git a/looper/cli_divvy.py b/looper/cli_divvy.py index 1fa98b69e..33b67947a 100644 --- a/looper/cli_divvy.py +++ b/looper/cli_divvy.py @@ -1,21 +1,23 @@ -import logmuse import os import sys + +import logmuse import yaml +from ubiquerg import VersionInHelpParser, is_writable from yaml import SafeLoader -from ubiquerg import is_writable, VersionInHelpParser + from .const import ( DEFAULT_COMPUTE_RESOURCES_NAME, DEFAULT_CONFIG_FILEPATH, ) -from .divvy import select_divvy_config, ComputingConfiguration, divvy_init +from .divvy import ComputingConfiguration, divvy_init, select_divvy_config def build_argparser(): - """ - Builds argument parser. + """Builds argument parser. - :return argparse.ArgumentParser + Returns: + argparse.ArgumentParser: The argument parser. """ banner = ( @@ -99,7 +101,7 @@ def add_subparser(cmd, description): return parser -def main(): +def main() -> None: """Primary workflow for divvy CLI""" parser = logmuse.add_logging_options(build_argparser()) diff --git a/looper/cli_pydantic.py b/looper/cli_pydantic.py index 3ec094d0f..5da08a66b 100644 --- a/looper/cli_pydantic.py +++ b/looper/cli_pydantic.py @@ -1,70 +1,59 @@ """ -CLI script using `pydantic-argparse` for parsing of arguments +CLI script using pydantic-settings for CLI parsing. 
-Arguments / commands are defined in `command_models/` and are given, eventually, as -`pydantic` models, allowing for type-checking and validation of arguments. - -Note: this is only a test script so far, and coexists next to the current CLI -(`cli_looper.py`), which uses `argparse` directly. The goal is to eventually -replace the current CLI with a CLI based on above-mentioned `pydantic` models, -but whether this will happen with `pydantic-argparse` or another, possibly self- -written library is not yet clear. -It is well possible that this script will be removed again. +Arguments / commands are defined in `command_models/` as pydantic models, +allowing for type-checking and validation of arguments. """ -# Note: The following import is used for forward annotations (Python 3.8) -# to prevent potential 'TypeError' related to the use of the '|' operator -# with types. -from __future__ import annotations - +import os import sys +from argparse import Namespace + +from pydantic_settings import get_subcommand + +from .command_models.commands import SUPPORTED_COMMANDS, TopLevelParser + + +def flatten_args(args: TopLevelParser) -> Namespace: + """Convert pydantic-settings args to argparse.Namespace for compatibility. + + pydantic-settings produces a nested structure where subcommand args are + accessed via args.run.dry_run, args.check.flags, etc. The rest of looper + expects flat access (args.dry_run, args.flags). This function flattens + the active subcommand's arguments into a standard Namespace. + + Only one subcommand is ever active at a time, so there are no conflicts + between arguments with the same name on different subcommands. 
+ """ + subcmd_args = get_subcommand(args, is_required=True) + + # Determine command name from the subcommand model + command = None + for cmd in SUPPORTED_COMMANDS: + if getattr(args, cmd.name, None) is subcmd_args: + command = cmd.name + break + + ns = Namespace( + command=command, + silent=args.silent, + verbosity=args.verbosity, + logdev=args.logdev, + ) + if subcmd_args is not None: + for k, v in subcmd_args.model_dump().items(): + setattr(ns, k, v) + return ns + -import logmuse -import pydantic_argparse -import yaml -from eido import inspect_project -from pephubclient import PEPHubClient -from pydantic_argparse.argparse.parser import ArgumentParser - -from . import __version__ - -from .command_models.arguments import ArgumentEnum - -from .command_models.commands import ( - SUPPORTED_COMMANDS, - TopLevelParser, - add_short_arguments, -) -from .const import * -from .divvy import DEFAULT_COMPUTE_RESOURCES_NAME, select_divvy_config -from .exceptions import * -from .looper import * -from .parser_types import * -from .project import Project, ProjectContext -from .utils import ( - dotfile_path, - enrich_args_via_cfg, - is_pephub_registry_path, - read_looper_config_file, - read_looper_dotfile, - initiate_looper_config, - init_generic_pipeline, - read_yaml_file, - inspect_looper_config_file, - is_PEP_file_type, - looper_config_tutorial, -) - -from typing import List, Tuple -from rich.console import Console - - -def opt_attr_pair(name: str) -> Tuple[str, str]: +def opt_attr_pair(name: str) -> tuple[str, str]: """Takes argument as attribute and returns as tuple of top-level or subcommand used.""" return f"--{name}", name.replace("-", "_") -def validate_post_parse(args: argparse.Namespace) -> List[str]: +def validate_post_parse( + args, sample_exclusion_optname: str, sample_inclusion_optname: str +) -> list[str]: """Checks if user is attempting to use mutually exclusive options.""" problems = [] used_exclusives = [ @@ -74,16 +63,11 @@ def validate_post_parse(args: 
argparse.Namespace) -> List[str]: [ "skip", "limit", - SAMPLE_EXCLUSION_OPTNAME, - SAMPLE_INCLUSION_OPTNAME, + sample_exclusion_optname, + sample_inclusion_optname, ], ) - # Depending on the subcommand used, the above options might either be in - # the top-level namespace or in the subcommand namespace (the latter due - # to a `modify_args_namespace()`) - if getattr( - args, attr, None - ) # or (getattr(args.run, attr, None) if hasattr(args, "run") else False) + if getattr(args, attr, None) ] if len(used_exclusives) > 1: problems.append( @@ -92,51 +76,95 @@ def validate_post_parse(args: argparse.Namespace) -> List[str]: return problems -# TODO rename to run_looper_via_cli for running lloper as a python library: -# https://github.com/pepkit/looper/pull/472#discussion_r1521970763 -def run_looper(args: TopLevelParser, parser: ArgumentParser, test_args=None): - # here comes adapted `cli_looper.py` code +def run_looper(args: Namespace, test_args=None): + """Run looper with parsed arguments. + + Args: + args: Flattened arguments from pydantic-settings + test_args: Optional test arguments for testing purposes + """ + # Lazy imports - only load when actually running commands + import logmuse + import yaml + from eido import inspect_project + from pephubclient import PEPHubClient + from rich.console import Console + + from . 
import __version__ + from .const import ( + CLI_KEY, + CLI_PROJ_ATTRS, + EXAMPLE_COMPUTE_SPEC_FMT, + PROJECT_PL_ARG, + SAMPLE_EXCLUSION_OPTNAME, + SAMPLE_INCLUSION_OPTNAME, + SAMPLE_PL_ARG, + PipelineLevel, + ) + from .divvy import DEFAULT_COMPUTE_RESOURCES_NAME, select_divvy_config + from .exceptions import ( + MisconfigurationException, + PipestatConfigurationException, + SampleFailedException, + ) + from .looper import ( + Checker, + Cleaner, + Collator, + Destroyer, + Linker, + Reporter, + Runner, + Tabulator, + ) + from .project import Project, ProjectContext + from .utils import ( + dotfile_path, + enrich_args_via_cfg, + init_generic_pipeline, + initiate_looper_config, + inspect_looper_config_file, + is_PEP_file_type, + is_pephub_registry_path, + looper_config_tutorial, + read_looper_config_file, + read_looper_dotfile, + read_yaml_file, + ) + global _LOGGER _LOGGER = logmuse.logger_via_cli(args, make_root=True) - # Find out which subcommand was used - supported_command_names = [cmd.name for cmd in SUPPORTED_COMMANDS] - subcommand_valued_args = [ - (arg, value) - for arg, value in vars(args).items() - if arg and arg in supported_command_names and value is not None - ] - # Only one subcommand argument will be not `None`, else we found a bug in `pydantic-argparse` - [(subcommand_name, subcommand_args)] = subcommand_valued_args - - cli_use_errors = validate_post_parse(subcommand_args) - if cli_use_errors: - parser.print_help(sys.stderr) - parser.error( - f"{len(cli_use_errors)} CLI use problem(s): {', '.join(cli_use_errors)}" - ) + subcommand_name = args.command if subcommand_name is None: - parser.print_help(sys.stderr) + print("No command specified. 
Use --help for usage.", file=sys.stderr) sys.exit(1) - if subcommand_name == "init": + cli_use_errors = validate_post_parse( + args, SAMPLE_EXCLUSION_OPTNAME, SAMPLE_INCLUSION_OPTNAME + ) + if cli_use_errors: + print(f"Error: {', '.join(cli_use_errors)}", file=sys.stderr) + print("Run 'looper --help' for usage information.", file=sys.stderr) + sys.exit(1) + if subcommand_name == "init": console = Console() console.clear() - console.rule(f"\n[magenta]Looper initialization[/magenta]") - selection = subcommand_args.generic + console.rule("\n[magenta]Looper initialization[/magenta]") + selection = args.generic if selection is True: console.clear() return int( not initiate_looper_config( dotfile_path(), - subcommand_args.pep_config, - subcommand_args.output_dir, - subcommand_args.sample_pipeline_interfaces, - subcommand_args.project_pipeline_interfaces, - subcommand_args.force_yes, + args.pep_config, + args.output_dir, + args.sample_pipeline_interfaces, + args.project_pipeline_interfaces, + args.force_yes, ) ) else: @@ -150,8 +178,8 @@ def run_looper(args: TopLevelParser, parser: ArgumentParser, test_args=None): looper_cfg_path = os.path.relpath(dotfile_path(), start=os.curdir) try: - if subcommand_args.config: - looper_config_dict = read_looper_config_file(subcommand_args.config) + if args.config: + looper_config_dict = read_looper_config_file(args.config) else: looper_config_dict = read_looper_dotfile() _LOGGER.info(f"Using looper config ({looper_cfg_path}).") @@ -161,120 +189,111 @@ def run_looper(args: TopLevelParser, parser: ArgumentParser, test_args=None): if looper_config_key == CLI_KEY: cli_modifiers_dict = looper_config_item else: - setattr(subcommand_args, looper_config_key, looper_config_item) + setattr(args, looper_config_key, looper_config_item) except OSError as e: - if subcommand_args.config: + if args.config: _LOGGER.warning( - f"\nLooper config file does not exist at given path {subcommand_args.config}. 
Use looper init to create one at {looper_cfg_path}." + f"\nLooper config file does not exist at given path {args.config}. Use looper init to create one at {looper_cfg_path}." ) else: _LOGGER.warning(e) sys.exit(1) - subcommand_args = enrich_args_via_cfg( + args = enrich_args_via_cfg( subcommand_name, - subcommand_args, - parser, + args, test_args=test_args, cli_modifiers=cli_modifiers_dict, ) # If project pipeline interface defined in the cli, change name to: "pipeline_interface" - if vars(subcommand_args)[PROJECT_PL_ARG]: - subcommand_args.pipeline_interfaces = vars(subcommand_args)[PROJECT_PL_ARG] + if getattr(args, PROJECT_PL_ARG, None): + args.pipeline_interfaces = getattr(args, PROJECT_PL_ARG) divcfg = ( - select_divvy_config(filepath=subcommand_args.divvy) - if hasattr(subcommand_args, "divvy") - else None + select_divvy_config(filepath=args.divvy) if hasattr(args, "divvy") else None ) # Ignore flags if user is selecting or excluding on flags: - if subcommand_args.sel_flag or subcommand_args.exc_flag: - subcommand_args.ignore_flags = True + if args.sel_flag or args.exc_flag: + args.ignore_flags = True # Initialize project - if is_PEP_file_type(subcommand_args.pep_config) and os.path.exists( - subcommand_args.pep_config - ): + if is_PEP_file_type(args.pep_config) and os.path.exists(args.pep_config): try: p = Project( - cfg=subcommand_args.pep_config, - amendments=subcommand_args.amend, + cfg=args.pep_config, + amendments=args.amend, divcfg_path=divcfg, runp=subcommand_name == "runp", **{ - attr: getattr(subcommand_args, attr) + attr: getattr(args, attr) for attr in CLI_PROJ_ATTRS - if attr in subcommand_args + if hasattr(args, attr) }, ) except yaml.parser.ParserError as e: _LOGGER.error(f"Project config parse failed -- {e}") sys.exit(1) - elif is_pephub_registry_path(subcommand_args.pep_config): - if vars(subcommand_args)[SAMPLE_PL_ARG]: + elif is_pephub_registry_path(args.pep_config): + if getattr(args, SAMPLE_PL_ARG, None): p = Project( - 
amendments=subcommand_args.amend, + amendments=args.amend, divcfg_path=divcfg, runp=subcommand_name == "runp", - project_dict=PEPHubClient().load_raw_pep( - registry_path=subcommand_args.pep_config - ), + project_dict=PEPHubClient().load_raw_pep(registry_path=args.pep_config), **{ - attr: getattr(subcommand_args, attr) + attr: getattr(args, attr) for attr in CLI_PROJ_ATTRS - if attr in subcommand_args + if hasattr(args, attr) }, ) else: raise MisconfigurationException( - f"`sample_pipeline_interface` is missing. Provide it in the parameters." + "`sample_pipeline_interface` is missing. Provide it in the parameters." ) else: raise MisconfigurationException( - f"Cannot load PEP. Check file path or registry path to pep." + "Cannot load PEP. Check file path or registry path to pep." ) selected_compute_pkg = p.selected_compute_package or DEFAULT_COMPUTE_RESOURCES_NAME if p.dcc is not None and not p.dcc.activate_package(selected_compute_pkg): _LOGGER.info( - "Failed to activate '{}' computing package. " - "Using the default one".format(selected_compute_pkg) + "Failed to activate '{}' computing package. 
Using the default one".format( + selected_compute_pkg + ) ) with ProjectContext( prj=p, - selector_attribute=subcommand_args.sel_attr, - selector_include=subcommand_args.sel_incl, - selector_exclude=subcommand_args.sel_excl, - selector_flag=subcommand_args.sel_flag, - exclusion_flag=subcommand_args.exc_flag, + selector_attribute=args.sel_attr, + selector_include=args.sel_incl, + selector_exclude=args.sel_excl, + selector_flag=args.sel_flag, + exclusion_flag=args.exc_flag, ) as prj: - # Check at the beginning if user wants to use pipestat and pipestat is configurable is_pipestat_configured = ( prj._check_if_pipestat_configured(pipeline_type=PipelineLevel.PROJECT.value) - if getattr(subcommand_args, "project", None) or subcommand_name == "runp" + if getattr(args, "project", None) or subcommand_name == "runp" else prj._check_if_pipestat_configured() ) if subcommand_name in ["run", "rerun"]: - if getattr(subcommand_args, "project", None): + if getattr(args, "project", None): _LOGGER.warning( "Project flag set but 'run' command was used. Please use 'runp' to run at project-level." ) rerun = subcommand_name == "rerun" run = Runner(prj) try: - # compute_kwargs = _proc_resources_spec(args) - compute_kwargs = _proc_resources_spec(subcommand_args) - - # TODO Shouldn't top level args and subcommand args be accessible on the same object? 
- return run( - subcommand_args, top_level_args=args, rerun=rerun, **compute_kwargs + compute_kwargs = _proc_resources_spec( + args, read_yaml_file, EXAMPLE_COMPUTE_SPEC_FMT, _LOGGER ) + + return run(args, rerun=rerun, **compute_kwargs) except SampleFailedException: sys.exit(1) except IOError: @@ -286,40 +305,42 @@ def run_looper(args: TopLevelParser, parser: ArgumentParser, test_args=None): raise if subcommand_name == "runp": - compute_kwargs = _proc_resources_spec(subcommand_args) + compute_kwargs = _proc_resources_spec( + args, read_yaml_file, EXAMPLE_COMPUTE_SPEC_FMT, _LOGGER + ) collate = Collator(prj) - collate(subcommand_args, **compute_kwargs) + collate(args, **compute_kwargs) return collate.debug if subcommand_name == "destroy": - return Destroyer(prj)(subcommand_args) + return Destroyer(prj)(args) if subcommand_name == "table": if is_pipestat_configured: - return Tabulator(prj)(subcommand_args) + return Tabulator(prj)(args) else: raise PipestatConfigurationException("table") if subcommand_name == "report": if is_pipestat_configured: - return Reporter(prj)(subcommand_args) + return Reporter(prj)(args) else: raise PipestatConfigurationException("report") if subcommand_name == "link": if is_pipestat_configured: - Linker(prj)(subcommand_args) + Linker(prj)(args) else: raise PipestatConfigurationException("link") if subcommand_name == "check": if is_pipestat_configured: - return Checker(prj)(subcommand_args) + return Checker(prj)(args) else: raise PipestatConfigurationException("check") if subcommand_name == "clean": - return Cleaner(prj)(subcommand_args) + return Cleaner(prj)(args) if subcommand_name == "inspect": # Inspect PEP from Eido @@ -335,50 +356,64 @@ def run_looper(args: TopLevelParser, parser: ArgumentParser, test_args=None): def main(test_args=None) -> dict: - parser = pydantic_argparse.ArgumentParser( - model=TopLevelParser, - prog="looper", - description="Looper: A job submitter for Portable Encapsulated Projects", - add_help=True, - 
version="2.0.3", - ) + """Main entry point for looper CLI. + + Uses pydantic-settings for CLI parsing. - parser = add_short_arguments(parser, ArgumentEnum) + Args: + test_args: Optional list of arguments for testing + Returns: + Result from run_looper + """ if test_args: - args = parser.parse_typed_args(args=test_args) + args = TopLevelParser(_cli_parse_args=test_args) else: - args = parser.parse_typed_args() + args = TopLevelParser() - return run_looper(args, parser, test_args=test_args) + flat_args = flatten_args(args) + return run_looper(flat_args, test_args=test_args) def main_cli() -> None: main() -def _proc_resources_spec(args): - """ - Process CLI-sources compute setting specification. There are two sources - of compute settings in the CLI alone: +def _proc_resources_spec( + args, read_yaml_file, example_compute_spec_fmt, logger +) -> dict[str, str]: + """Process CLI-sources compute setting specification. + + There are two sources of compute settings in the CLI alone: * YAML file (--settings argument) * itemized compute settings (--compute argument) - The itemized compute specification is given priority + The itemized compute specification is given priority. + + Args: + args (argparse.Namespace): Arguments namespace. + read_yaml_file: Function to read YAML files. + example_compute_spec_fmt: Example format string for error messages. + logger: Logger instance. + + Returns: + Mapping[str, str]: Binding between resource setting name and value. - :param argparse.Namespace: arguments namespace - :return Mapping[str, str]: binding between resource setting name and value - :raise ValueError: if interpretation of the given specification as encoding - of key-value pairs fails + Raises: + ValueError: If interpretation of the given specification as encoding + of key-value pairs fails. 
""" + import yaml + spec = getattr(args, "compute", None) settings = args.settings try: settings_data = read_yaml_file(settings) or {} except yaml.YAMLError: - _LOGGER.warning( - "Settings file ({}) does not follow YAML format," - " disregarding".format(settings) + logger.warning( + "Settings file ({}) does not follow YAML format, disregarding".format( + settings + ) ) settings_data = {} if not spec: @@ -400,7 +435,7 @@ def _proc_resources_spec(args): if bads: raise ValueError( "Could not correctly parse itemized compute specification. " - "Correct format: " + EXAMPLE_COMPUTE_SPEC_FMT + "Correct format: " + example_compute_spec_fmt ) elif isinstance(spec, dict): for key, value in spec.items(): diff --git a/looper/command_models/DEVELOPER.md b/looper/command_models/DEVELOPER.md deleted file mode 100644 index d71f7bf65..000000000 --- a/looper/command_models/DEVELOPER.md +++ /dev/null @@ -1,85 +0,0 @@ -# Developer documentation - -## Adding new command models - -To add a new model (command) to the project, follow these steps: - -1. Add new arguments in `looper/command_models/arguments.py` if necessary. - -- Add a new entry for the `ArgumentEnum` class. -- For example: - -```python -# arguments.py - -class ArgumentEnum(enum.Enum): - ... - - NEW_ARGUMENT = Argument( - name="new_argument", - default=(new_argument_type, "default_value"), - description="Description of the new argument", - ) - -``` - -2. Create a new command in the existing command creation logic in `looper/command_models/commands.py`. - -- Create a new `Command` instance. -- Create a `pydantic` model for this new command. -- Add the new `Command` instance to `SUPPORTED_COMMANDS`. -- For example: - -```python -NewCommandParser = Command( - "new_command", - MESSAGE_BY_SUBCOMMAND["new_command"], - [ - ... 
- ArgumentEnum.NEW_ARGUMENT.value, - # Add more arguments as needed for the new command - ], -) -NewCommandParserModel = NewCommandParser.create_model() - -SUPPORTED_COMMANDS = [..., NewCommandParser] -``` - -3. Update the new argument(s) and command in `TopLevelParser` from `looper/command_models/commands.py`. - -- Add a new field for the new command. -- Add a new field for the new argument(s). -- For example: - -```python -class TopLevelParser(pydantic.BaseModel): - - # commands - ... - new_command: Optional[NewCommandParserModel] = pydantic.Field(description=NewCommandParser.description) - - # arguments - ... - new_argument: Optional[new_argument_type] = ArgumentEnum.NEW_ARGUMENT.value.with_reduced_default() -``` - -## Special treatment for the `run` command - -The `run` command in our project requires special treatment to accommodate hierarchical namespaces -and properly handle its unique characteristics. Several functions have been adapted to ensure the -correct behavior of the run command, and similar adaptations may be necessary for other commands. - -For developers looking to understand the details of the special treatment given to the `run` -command and its associated changes, we recommend to inspect the following functions / part of the -code: -- `looper/cli_looper.py`: - - `make_hierarchical_if_needed()` - - assignment of the `divcfg` variable - - assignment of the `project_args` variable - - `_proc_resources_spec()` - - `validate_post_parse()` -- `looper/utils.py`: - - `enrich_args_via_cfg()` - -If you are adding new commands to the project / migrate existing commands to a `pydantic` model-based definition, adapt these parts of the codes with equivalent behavior for your new command. -Likewise, adapt argument accessions in the corresponding executor in `looper/looper.py` to take into account the hierarchical organization of the command's arguments. 
diff --git a/looper/command_models/README.md b/looper/command_models/README.md index dea00d8bd..dc4eb4a67 100644 --- a/looper/command_models/README.md +++ b/looper/command_models/README.md @@ -1,4 +1,36 @@ -# `pydantic`-based definitions of `looper` commands and their arguments +# pydantic-based definitions of looper commands and their arguments -With the goal of writing an HTTP API that is in sync with the `looper` CLI, this module defines `looper` commands as `pydantic` models and arguments as fields in there. -These can then be used by the [`pydantic-argparse`](https://pydantic-argparse.supimdos.com/) library to create a type-validated CLI (see `../cli_pydantic.py`), and by the future HTTP API for validating `POST`ed JSON data. Eventually, the `pydantic-argparse`-based CLI will replace the existing `argparse`-based CLI defined in `../cli_looper.py`. +This module defines looper commands as pydantic models for use with: +- `pydantic-settings` for CLI parsing (see `../cli_pydantic.py`) +- HTTP API for validating POST data (see `../api/`) + +## Key files + +- `commands.py` - Command definitions and `TopLevelParser` (pydantic-settings entry point) +- `arguments.py` - Argument definitions (`ArgumentEnum`) +- `messages.py` - Subcommand help text + +## Adding a new command + +1. Add arguments to `ArgumentEnum` in `arguments.py`: + ```python + NEW_ARGUMENT = Argument( + name="new_argument", + default=(str, "default_value"), + description="Description", + ) + ``` + +2. Create the command in `commands.py`: + ```python + NewCommandParser = Command("new_command", MESSAGE_BY_SUBCOMMAND["new_command"], [...]) + NewCommandParserModel = NewCommandParser.create_model() + SUPPORTED_COMMANDS.append(NewCommandParser) + ``` + +3. Add to `TopLevelParser`: + ```python + new_command: CliSubCommand[NewCommandParserModel] = Field(description=...) + ``` + +4. Handle the command in `../cli_pydantic.py` `run_looper()`. 
diff --git a/looper/command_models/__init__.py b/looper/command_models/__init__.py index 46d1c396b..989ba827e 100644 --- a/looper/command_models/__init__.py +++ b/looper/command_models/__init__.py @@ -1,6 +1,5 @@ """ -This package holds `pydantic` models that describe commands and their arguments. +This package holds pydantic models that describe commands and their arguments. -These can be used either by an HTTP API or with the `pydantic-argparse` -library to build a CLI. +These are used by pydantic-settings for CLI parsing and by the HTTP API. """ diff --git a/looper/command_models/arguments.py b/looper/command_models/arguments.py index 68c329772..360cea32c 100644 --- a/looper/command_models/arguments.py +++ b/looper/command_models/arguments.py @@ -1,73 +1,91 @@ """ -Argument definitions via a thin wrapper around `pydantic.fields.FieldInfo` +Argument definitions for CLI arguments/flags. + +Stores CLI argument metadata (name, type, default, description, alias) +for use in both pydantic-settings CLI and FastAPI interfaces. """ import enum import os -from copy import copy -from typing import Any, List - -import pydantic.v1 as pydantic +from typing import Any +import pydantic +from pydantic import AliasChoices -class Argument(pydantic.fields.FieldInfo): - """ - CLI argument / flag definition - This class is designed to define CLI arguments or flags. It leverages - Pydantic for data validation and serves as a source of truth for multiple - interfaces, including a CLI. +class Argument: + """CLI argument / flag definition. - Naively, one would think one could just subclass `pydantic.Field`, - but actually `pydantic.Field` is a function, and not a class. - `pydantic.Field()` returns a validated `FieldInfo` instance, - so we instead subclass `FieldInfo` directly and validate it in the - constructor. 
+ This class stores CLI argument metadata for use in multiple interfaces: + - pydantic-settings CLI (via CliSubCommand) + - FastAPI HTTP API (via pydantic models) - :param str name: argument name, e.g. "ignore-args" - :param Any default: a tuple of the form (type, default_value). If the - default value is `...` (Ellipsis), then the argument is required. - :param str description: argument description, which will appear as the - help text for this argument - :param dict kwargs: additional keyword arguments supported by - `FieldInfo`. These are passed along as they are. + Args: + name (str): Argument name, e.g. "ignore-args". + default (Any): A tuple of the form (type, default_value). If the + default value is `...` (Ellipsis), then the argument is required. + description (str): Argument description, which will appear as the + help text for this argument. + alias (str | None): Short argument alias, e.g. "-i". """ def __init__( - self, name: str, default: Any, description: str, alias: str = None, **kwargs + self, + name: str, + default: Any, + description: str, + alias: str | None = None, ) -> None: self._name = name - super().__init__( - default=default, description=description, alias=alias, **kwargs - ) - self._validate() + self._default = default # tuple: (type, default_value) + self._description = description + self._alias = alias @property - def name(self): - """ - Argument name as used in the CLI, e.g. "ignore-args" - """ + def name(self) -> str: + """Argument name as used in the CLI, e.g. "ignore-args".""" return self._name + @property + def default(self) -> Any: + """Default value tuple (type, default_value).""" + return self._default + + @property + def description(self) -> str: + """Argument description / help text.""" + return self._description + + @property + def alias(self) -> str | None: + """Short argument alias, e.g. 
"-i".""" + return self._alias + def with_reduced_default(self) -> pydantic.fields.FieldInfo: """ - Convert to a `FieldInfo` instance with reduced default value - - Returns a copy of an instance, but with the `default` attribute - replaced by only the default value, without the type information. - This is required when using an instance in a direct `pydantic` - model definition, instead of creating a model dynamically using - `pydantic.create_model`. + Create a FieldInfo instance with the default value (not the type tuple). - TODO: this is due to this issue: - https://github.com/pydantic/pydantic/issues/2248#issuecomment-757448447 - and it's a bit tedious. + This is used when defining pydantic model fields directly, + where only the default value (not the type) is needed. + Uses AliasChoices to support kebab-case CLI flags (--dry-run) while + keeping underscore field names in Python (dry_run). """ - c = copy(self) - _, default_value = self.default - c.default = default_value - return c + _, default_value = self._default + # kebab-case version of the name for --dry-run style + long_name = self._name.replace("_", "-") + if self._alias: + return pydantic.Field( + default=default_value, + description=self._description, + validation_alias=AliasChoices(self._alias, long_name), + ) + # Even without alias, include kebab-case for CLI compatibility + return pydantic.Field( + default=default_value, + description=self._description, + validation_alias=AliasChoices(long_name), + ) class ArgumentEnum(enum.Enum): @@ -81,13 +99,13 @@ class ArgumentEnum(enum.Enum): IGNORE_FLAGS = Argument( name="ignore_flags", - alias="-i", + alias="i", default=(bool, False), description="Ignore run status flags", ) FORCE_YES = Argument( name="force_yes", - alias="-f", + alias="f", default=(bool, False), description="Provide upfront confirmation of destruction intent, to skip console query. 
Default=False", ) @@ -106,66 +124,69 @@ class ArgumentEnum(enum.Enum): FLAGS = Argument( name="flags", - alias="-f", - default=(List, []), + alias="f", + default=(list, []), description="Only check samples based on these status flags.", ) TIME_DELAY = Argument( name="time_delay", - alias="-t", + alias="t", default=(int, 0), description="Time delay in seconds between job submissions (min: 0, max: 30)", ) DRY_RUN = Argument( name="dry_run", - alias="-d", + alias="d", default=(bool, False), description="Don't actually submit jobs", ) COMMAND_EXTRA = Argument( name="command_extra", - alias="-x", + alias="x", default=(str, ""), description="String to append to every command", ) COMMAND_EXTRA_OVERRIDE = Argument( name="command_extra_override", - alias="-y", + alias="y", default=(str, ""), description="Same as command-extra, but overrides values in PEP", ) LUMP = Argument( name="lump", - alias="-u", - default=(float, None), + alias="u", + default=(float | None, None), description="Total input file size (GB) to batch into one job", ) LUMPN = Argument( name="lump_n", - alias="-n", - default=(int, None), + alias="n", + default=(int | None, None), description="Number of commands to batch into one job", ) LUMPJ = Argument( name="lump_j", - alias="-j", - default=(int, None), + alias="j", + default=(int | None, None), description="Lump samples into number of jobs.", ) LIMIT = Argument( - name="limit", alias="-l", default=(int, None), description="Limit to n samples" + name="limit", + alias="l", + default=(int | None, None), + description="Limit to n samples", ) SKIP = Argument( name="skip", - alias="-k", - default=(int, None), + alias="k", + default=(int | None, None), description="Skip samples by numerical index", ) CONFIG = Argument( name="config", - alias="-c", - default=(str, None), + alias="c", + default=(str | None, None), description="Looper configuration file (YAML)", ) SETTINGS = Argument( @@ -175,43 +196,49 @@ class ArgumentEnum(enum.Enum): ) PEP_CONFIG = Argument( 
name="pep_config", - default=(str, None), + default=(str | None, None), description="PEP configuration file", ) OUTPUT_DIR = Argument( name="output_dir", - alias="-o", - default=(str, None), + alias="o", + default=(str | None, None), description="Output directory", ) REPORT_OUTPUT_DIR = Argument( name="report_dir", - alias="-r", - default=(str, None), + alias="r", + default=(str | None, None), description="Set location for looper report and looper table outputs", ) GENERIC = Argument( name="generic", - alias="-g", + alias="g", default=(bool, False), description="Use generic looper config?", ) SAMPLE_PIPELINE_INTERFACES = Argument( name="sample_pipeline_interfaces", - alias="-S", - default=(List, []), + # Backwards compatibility note: Changed from -S to spi with pydantic-settings + # migration. Single-letter aliases are case-insensitive in pydantic-settings, + # causing conflicts with other arguments. + alias="spi", + default=(list, []), description="Paths to looper sample pipeline interfaces", ) PROJECT_PIPELINE_INTERFACES = Argument( name="project_pipeline_interfaces", - alias="-P", - default=(List, []), + # Backwards compatibility note: Changed from -P to ppi with pydantic-settings + # migration. Single-letter aliases are case-insensitive in pydantic-settings, + # causing conflicts with other arguments. 
+ alias="ppi", + default=(list, []), description="Paths to looper project pipeline interfaces", ) AMEND = Argument( - name="amend", default=(List, []), description="List of amendments to activate" + name="amend", default=(list, []), description="List of amendments to activate" ) SEL_ATTR = Argument( name="sel_attr", @@ -220,7 +247,7 @@ class ArgumentEnum(enum.Enum): ) SEL_INCL = Argument( name="sel_incl", - default=(List, []), + default=(list, []), description="Include only samples with these values", ) SEL_EXCL = Argument( @@ -229,26 +256,26 @@ class ArgumentEnum(enum.Enum): description="Exclude samples with these values", ) SEL_FLAG = Argument( - name="sel_flag", default=(List, []), description="Sample selection flag" + name="sel_flag", default=(list, []), description="Sample selection flag" ) EXC_FLAG = Argument( - name="exc_flag", default=(List, []), description="Sample exclusion flag" + name="exc_flag", default=(list, []), description="Sample exclusion flag" ) SKIP_FILE_CHECKS = Argument( name="skip_file_checks", - alias="-f", + alias="f", # Restored: no conflict since run/rerun/runp don't use FORCE_YES default=(bool, False), description="Do not perform input file checks", ) PACKAGE = Argument( name="package", - alias="-p", - default=(str, None), + alias="p", + default=(str | None, None), description="Name of computing resource package to use", ) COMPUTE = Argument( name="compute", - default=(List, []), + default=(list, []), description="List of key-value pairs (k1=v1)", ) DIVVY = Argument( @@ -265,7 +292,7 @@ class ArgumentEnum(enum.Enum): ) VERBOSITY = Argument( name="verbosity", - default=(int, None), + default=(int | None, None), description="Alternate mode of expression for logging level that better " "accords with intuition about how to convey this.", ) @@ -278,7 +305,7 @@ class ArgumentEnum(enum.Enum): ) PIPESTAT = Argument( name="pipestat", - default=(str, None), + default=(str | None, None), description="Path to pipestat files.", ) PORTABLE = 
Argument( diff --git a/looper/command_models/commands.py b/looper/command_models/commands.py index 69312f0d6..2e1021415 100644 --- a/looper/command_models/commands.py +++ b/looper/command_models/commands.py @@ -1,44 +1,91 @@ """ `pydantic` models for `looper` commands and a wrapper class. + +Uses native pydantic v2 for model definitions. The CLI is built from +these models using argparse in cli_pydantic.py. """ +import json from dataclasses import dataclass -from typing import List, Optional, Type, Union +from typing import Annotated -import pydantic.v1 as pydantic +import pydantic +from pydantic import AliasChoices, BeforeValidator, Field +from pydantic_settings import BaseSettings, CliSubCommand, SettingsConfigDict -from ..const import MESSAGE_BY_SUBCOMMAND from .arguments import Argument, ArgumentEnum -from pydantic_argparse import ArgumentParser +from .messages import MESSAGE_BY_SUBCOMMAND # Local import, no looper/__init__.py + + +def deserialize_cli_list(v): + """Deserialize list values from pydantic-settings CLI parsing. + + pydantic-settings internally serializes all list values as JSON strings + (e.g., ["a"] becomes '["a"]') before passing to CliSubCommand models. + Since subcommands are instantiated directly (not through settings sources), + the automatic JSON deserialization doesn't happen. + + This is a pydantic-settings limitation, not a bug in our code. + See: https://github.com/pydantic/pydantic-settings/issues/335 + """ + if isinstance(v, list): + return v + if isinstance(v, str): + try: + parsed = json.loads(v) + if isinstance(parsed, list): + return parsed + except json.JSONDecodeError: + pass + return [x.strip() for x in v.split(",") if x.strip()] + return v + + +CliList = Annotated[list, BeforeValidator(deserialize_cli_list)] @dataclass class Command: - """ - Representation of a command + """Representation of a command. 
- :param str name: command name - :param str description: command description - :param list[Argument] arguments: list of arguments supported by this command + Args: + name (str): Command name. + description (str): Command description. + arguments (list[Argument]): List of arguments supported by this command. """ name: str description: str - arguments: List[Argument] + arguments: list[Argument] - def create_model(self) -> Type[pydantic.BaseModel]: + def create_model(self) -> type[pydantic.BaseModel]: """ - Creates a `pydantic` model for this command + Creates a `pydantic` model for this command. + + Uses AliasChoices to support kebab-case CLI flags (--dry-run) while + keeping underscore field names in Python (dry_run). """ - arguments = dict() + arguments = {} for arg in self.arguments: - # These gymnastics are necessary because of - # https://github.com/pydantic/pydantic/issues/2248#issuecomment-757448447 arg_type, arg_default_value = arg.default - arguments[arg.name] = ( - arg_type, - pydantic.Field(arg_default_value, description=arg.description), - ) + if arg_type is list: + arg_type = CliList + # kebab-case version of the name for --dry-run style + long_name = arg.name.replace("_", "-") + if arg.alias: + field = pydantic.Field( + arg_default_value, + description=arg.description, + validation_alias=AliasChoices(arg.alias, long_name), + ) + else: + # Even without alias, include kebab-case for CLI compatibility + field = pydantic.Field( + arg_default_value, + description=arg.description, + validation_alias=AliasChoices(long_name), + ) + arguments[arg.name] = (arg_type, field) return pydantic.create_model(self.name, **arguments) @@ -57,7 +104,6 @@ def create_model(self) -> Type[pydantic.BaseModel]: ArgumentEnum.SAMPLE_PIPELINE_INTERFACES.value, ArgumentEnum.PROJECT_PIPELINE_INTERFACES.value, ArgumentEnum.PIPESTAT.value, - ArgumentEnum.SETTINGS.value, ArgumentEnum.AMEND.value, ArgumentEnum.PROJECT_LEVEL.value, ] @@ -224,7 +270,7 @@ def create_model(self) -> 
Type[pydantic.BaseModel]: LinkParser.arguments.append(arg) InspectParser.arguments.append(arg) -# Create all Models +# Create all Models (for use with FastAPI) RunParserModel = RunParser.create_model() RerunParserModel = RerunParser.create_model() RunProjectParserModel = RunProjectParser.create_model() @@ -239,39 +285,6 @@ def create_model(self) -> Type[pydantic.BaseModel]: InitPifaceParserModel = InitPifaceParser.create_model() -def add_short_arguments( - parser: ArgumentParser, argument_enums: Type[ArgumentEnum] -) -> ArgumentParser: - """ - This function takes a parser object created under pydantic argparse and adds the short arguments AFTER the initial creation. - This is a workaround as pydantic-argparse does not currently support this during initial parser creation. - - :param ArgumentParser parser: parser before adding short arguments - :param Type[ArgumentEnum] argument_enums: enumeration of arguments that contain names and aliases - :return ArgumentParser parser: parser after short arguments have been added - """ - - for cmd in parser._subcommands.choices.keys(): - - for argument_enum in list(argument_enums): - # First check there is an alias for the argument otherwise skip - if argument_enum.value.alias: - short_key = argument_enum.value.alias - long_key = "--" + argument_enum.value.name.replace( - "_", "-" - ) # We must do this because the ArgumentEnum names are transformed during parser creation - if long_key in parser._subcommands.choices[cmd]._option_string_actions: - argument = parser._subcommands.choices[cmd]._option_string_actions[ - long_key - ] - argument.option_strings = (short_key, long_key) - parser._subcommands.choices[cmd]._option_string_actions[ - short_key - ] = argument - - return parser - - SUPPORTED_COMMANDS = [ RunParser, RerunParser, @@ -288,48 +301,54 @@ def add_short_arguments( ] -class TopLevelParser(pydantic.BaseModel): - """ - Top level parser that takes - - commands (run, runp, check...) 
- - arguments that are required no matter the subcommand - """ +class TopLevelParser(BaseSettings): + """A pipeline submission engine for PEP-formatted projects.""" - # commands - run: Optional[RunParserModel] = pydantic.Field(description=RunParser.description) - rerun: Optional[RerunParserModel] = pydantic.Field( - description=RerunParser.description + model_config = SettingsConfigDict( + cli_parse_args=True, + cli_prog_name="looper", + cli_kebab_case=True, # Use --dry-run not --dry_run + cli_implicit_flags=True, # Allow --dry-run without value (instead of --dry-run true) + cli_hide_none_type=True, # Hide {bool,null} type hints in help ) - runp: Optional[RunProjectParserModel] = pydantic.Field( - description=RunProjectParser.description + + # commands (CliSubCommand creates argparse subparsers - only one is used at a time) + run: CliSubCommand[RunParserModel] = Field(description=MESSAGE_BY_SUBCOMMAND["run"]) + rerun: CliSubCommand[RerunParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["rerun"] ) - table: Optional[TableParserModel] = pydantic.Field( - description=TableParser.description + runp: CliSubCommand[RunProjectParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["runp"] ) - report: Optional[ReportParserModel] = pydantic.Field( - description=ReportParser.description + table: CliSubCommand[TableParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["table"] ) - destroy: Optional[DestroyParserModel] = pydantic.Field( - description=DestroyParser.description + report: CliSubCommand[ReportParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["report"] ) - check: Optional[CheckParserModel] = pydantic.Field( - description=CheckParser.description + destroy: CliSubCommand[DestroyParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["destroy"] ) - clean: Optional[CleanParserModel] = pydantic.Field( - description=CleanParser.description + check: CliSubCommand[CheckParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["check"] ) - init: 
Optional[InitParserModel] = pydantic.Field(description=InitParser.description) - init_piface: Optional[InitPifaceParserModel] = pydantic.Field( - description=InitPifaceParser.description + clean: CliSubCommand[CleanParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["clean"] ) - link: Optional[LinkParserModel] = pydantic.Field(description=LinkParser.description) - - inspect: Optional[InspectParserModel] = pydantic.Field( - description=InspectParser.description + init: CliSubCommand[InitParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["init"] + ) + init_piface: CliSubCommand[InitPifaceParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["init-piface"] + ) + link: CliSubCommand[LinkParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["link"] + ) + inspect: CliSubCommand[InspectParserModel] = Field( + description=MESSAGE_BY_SUBCOMMAND["inspect"] ) - # Additional arguments for logging, added to ALL commands - # These must be used before the command - silent: Optional[bool] = ArgumentEnum.SILENT.value.with_reduced_default() - verbosity: Optional[int] = ArgumentEnum.VERBOSITY.value.with_reduced_default() - logdev: Optional[bool] = ArgumentEnum.LOGDEV.value.with_reduced_default() + # Additional arguments for logging + silent: bool | None = ArgumentEnum.SILENT.value.with_reduced_default() + verbosity: int | None = ArgumentEnum.VERBOSITY.value.with_reduced_default() + logdev: bool | None = ArgumentEnum.LOGDEV.value.with_reduced_default() diff --git a/looper/command_models/messages.py b/looper/command_models/messages.py new file mode 100644 index 000000000..71aa82dd5 --- /dev/null +++ b/looper/command_models/messages.py @@ -0,0 +1,20 @@ +"""Subcommand help messages for CLI. + +Extracted to avoid importing looper.const (which triggers heavy package imports) +during CLI startup for --help. 
+""" + +MESSAGE_BY_SUBCOMMAND = { + "run": "Run or submit sample jobs.", + "rerun": "Resubmit sample jobs with failed flags.", + "runp": "Run or submit project jobs.", + "table": "Write summary stats table for project samples.", + "report": "Create browsable HTML report of project results.", + "destroy": "Remove output files of the project.", + "check": "Check flag status of current runs.", + "clean": "Run clean scripts of already processed jobs.", + "inspect": "Print information about a project.", + "init": "Initialize looper config file.", + "init-piface": "Initialize generic pipeline interface.", + "link": "Create directory of symlinks for reported results.", +} diff --git a/looper/conductor.py b/looper/conductor.py index 268db5432..d3a4760b1 100644 --- a/looper/conductor.py +++ b/looper/conductor.py @@ -3,55 +3,78 @@ import importlib import logging import os -import subprocess +import shlex import signal -import psutil +import subprocess import sys +import threading import time -import yaml -from math import ceil from json import loads +from math import ceil from subprocess import check_output -from typing import * -from eido import read_schema, get_input_files_size +import psutil +import yaml +from eido import get_input_files_size, read_schema from eido.const import INPUT_FILE_SIZE_KEY, MISSING_KEY from jinja2.exceptions import UndefinedError - from peppy.const import CONFIG_KEY, SAMPLE_YAML_EXT from peppy.exceptions import RemoteYAMLError from pipestat import PipestatError from ubiquerg import expandpath +from yacman import YAMLConfigManager from yaml import dump -from yacman import FutureYAMLConfigManager as YAMLConfigManager -from .const import * +from .const import ( + EXTRA_PROJECT_CMD_TEMPLATE, + EXTRA_SAMPLE_CMD_TEMPLATE, + JOB_NAME_KEY, + NOT_SUB_MSG, + OUTDIR_KEY, + OUTPUT_SCHEMA_KEY, + PRE_SUBMIT_CMD_KEY, + PRE_SUBMIT_HOOK_KEY, + PRE_SUBMIT_PY_FUN_KEY, + PROJECT_PL_KEY, + RESULTS_SUBDIR_KEY, + SAMPLE_CWL_YAML_PATH_KEY, + SAMPLE_PL_KEY, + 
SUBMISSION_SUBDIR_KEY, + VAR_TEMPL_KEY, + PipelineLevel, +) from .exceptions import JobSubmissionException from .processed_project import populate_sample_paths from .utils import ( + expand_nested_var_templates, fetch_sample_flags, jinja_render_template_strictly, - expand_nested_var_templates, + render_inject_env_vars, ) -from .const import PipelineLevel - _LOGGER = logging.getLogger(__name__) -def _get_yaml_path(namespaces, template_key, default_name_appendix="", filename=None): - """ - Get a path to a YAML file for the sample. - - :param dict[dict]] namespaces: namespaces mapping - :param str template_key: the name of the key in 'var_templates' piface - section that points to a template to render to get the - user-provided target YAML path - :param str default_name_appendix: a string to append to insert in target - YAML file name: '{sample.sample_name}<>.yaml' - :param str filename: A filename without folders. If not provided, a - default name of sample_name.yaml will be used. - :return str: sample YAML file path +def _get_yaml_path( + namespaces: dict, + template_key: str, + default_name_appendix: str = "", + filename: str | None = None, +) -> str: + """Get a path to a YAML file for the sample. + + Args: + namespaces (dict[dict]): Namespaces mapping. + template_key (str): The name of the key in 'var_templates' piface + section that points to a template to render to get the + user-provided target YAML path. + default_name_appendix (str): A string to append to insert in target + YAML file name: '{sample.sample_name}<>.yaml'. + filename (str): A filename without folders. If not provided, a + default name of sample_name.yaml will be used. + + Returns: + str: Sample YAML file path. 
""" if ( VAR_TEMPL_KEY in namespaces["pipeline"] @@ -90,12 +113,19 @@ def _get_yaml_path(namespaces, template_key, default_name_appendix="", filename= return final_path -def write_pipestat_config(looper_pipestat_config_path, pipestat_config_dict): - """ - This writes a combined configuration file to be passed to a PipestatManager. - :param str looper_pipestat_config_path: path to the created pipestat configuration file - :param dict pipestat_config_dict: the dict containing key value pairs to be written to the pipestat configutation - return bool +def write_pipestat_config( + looper_pipestat_config_path: str, pipestat_config_dict: dict +) -> bool: + """Write a combined configuration file to be passed to a PipestatManager. + + Args: + looper_pipestat_config_path (str): Path to the created pipestat + configuration file. + pipestat_config_dict (dict): The dict containing key value pairs to be + written to the pipestat configuration. + + Returns: + bool: True if successful. """ if not os.path.exists(os.path.dirname(looper_pipestat_config_path)): @@ -113,12 +143,14 @@ def write_pipestat_config(looper_pipestat_config_path, pipestat_config_dict): return True -def write_submission_yaml(namespaces): - """ - Save all namespaces to YAML. +def write_submission_yaml(namespaces: dict) -> dict: + """Save all namespaces to YAML. - :param dict namespaces: variable namespaces dict - :return dict: sample namespace dict + Args: + namespaces (dict): Variable namespaces dict. + + Returns: + dict: Sample namespace dict. """ path = _get_yaml_path(namespaces, SAMPLE_CWL_YAML_PATH_KEY, "_submission") my_namespaces = {} @@ -145,54 +177,54 @@ def __init__( self, pipeline_interface, prj, - delay=0, - extra_args=None, - extra_args_override=None, - ignore_flags=False, - compute_variables=None, - max_cmds=None, - max_size=None, - max_jobs=None, - automatic=True, - collate=False, - ): - """ - Create a job submission manager. 
+ delay: float = 0, + extra_args: str | None = None, + extra_args_override: str | None = None, + ignore_flags: bool = False, + compute_variables: dict | None = None, + max_cmds: int | None = None, + max_size: int | float | None = None, + max_jobs: int | float | None = None, + automatic: bool = True, + collate: bool = False, + ) -> None: + """Create a job submission manager. The most critical inputs are the pipeline interface and the pipeline key, which together determine which provide critical pipeline information like resource allocation packages and which pipeline will be overseen by this instance, respectively. - :param PipelineInterface pipeline_interface: Collection of important - data for one or more pipelines, like resource allocation packages - and option/argument specifications - :param prj: Project with which each sample being considered is - associated (what generated each sample) - :param float delay: Time (in seconds) to wait before submitting a job - once it's ready - :param str extra_args: string to pass to each job generated, - for example additional pipeline arguments - :param str extra_args_override: string to pass to each job generated, - for example additional pipeline arguments. This deactivates the - 'extra' functionality that appends strings defined in - Sample.command_extra and Project.looper.command_extra to the - command template. - :param bool ignore_flags: Whether to ignore flag files present in - the sample folder for each sample considered for submission - :param dict[str] compute_variables: A dict with variables that will be made - available to the compute package. For example, this should include - the name of the cluster partition to which job or jobs will be submitted - :param int | NoneType max_cmds: Upper bound on number of commands to - include in a single job script. - :param int | float | NoneType max_size: Upper bound on total file - size of inputs used by the commands lumped into single job script. 
- :param int | float | NoneType max_jobs: Upper bound on total number of jobs to - group samples for submission. - :param bool automatic: Whether the submission should be automatic once - the pool reaches capacity. - :param bool collate: Whether a collate job is to be submitted (runs on - the project level, rather that on the sample level) + Args: + pipeline_interface (PipelineInterface): Collection of important + data for one or more pipelines, like resource allocation packages + and option/argument specifications. + prj: Project with which each sample being considered is + associated (what generated each sample). + delay (float): Time (in seconds) to wait before submitting a job + once it's ready. + extra_args (str): String to pass to each job generated, + for example additional pipeline arguments. + extra_args_override (str): String to pass to each job generated, + for example additional pipeline arguments. This deactivates the + 'extra' functionality that appends strings defined in + Sample.command_extra and Project.looper.command_extra to the + command template. + ignore_flags (bool): Whether to ignore flag files present in + the sample folder for each sample considered for submission. + compute_variables (dict[str]): A dict with variables that will be made + available to the compute package. For example, this should include + the name of the cluster partition to which job or jobs will be submitted. + max_cmds (int | None): Upper bound on number of commands to + include in a single job script. + max_size (int | float | None): Upper bound on total file + size of inputs used by the commands lumped into single job script. + max_jobs (int | float | None): Upper bound on total number of jobs to + group samples for submission. + automatic (bool): Whether the submission should be automatic once + the pool reaches capacity. + collate (bool): Whether a collate job is to be submitted (runs on + the project level, rather that on the sample level). 
""" super(SubmissionConductor, self).__init__() @@ -224,8 +256,9 @@ def __init__( if self.extra_pipe_args: _LOGGER.debug( - "String appended to every pipeline command: " - "{}".format(self.extra_pipe_args) + "String appended to every pipeline command: {}".format( + self.extra_pipe_args + ) ) if max_jobs: @@ -258,32 +291,35 @@ def __init__( self._skipped_sample_pools = [] @property - def failed_samples(self): + def failed_samples(self) -> list[str]: return self._failed_sample_names @property - def num_cmd_submissions(self): - """ - Return the number of commands that this conductor has submitted. + def num_cmd_submissions(self) -> int: + """Return the number of commands that this conductor has submitted. - :return int: Number of commands submitted so far. + Returns: + int: Number of commands submitted so far. """ return self._num_cmds_submitted @property - def num_job_submissions(self): - """ - Return the number of jobs that this conductor has submitted. + def num_job_submissions(self) -> int: + """Return the number of jobs that this conductor has submitted. - :return int: Number of jobs submitted so far. + Returns: + int: Number of jobs submitted so far. """ return self._num_good_job_submissions - def is_project_submittable(self, force=False): - """ - Check whether the current project has been already submitted + def is_project_submittable(self, force: bool = False) -> bool: + """Check whether the current project has been already submitted. + + Args: + force (bool): Whether to force the project submission (ignore status/flags). - :param bool frorce: whether to force the project submission (ignore status/flags) + Returns: + bool: True if the project is submittable, False otherwise. """ psms = {} if self.prj.pipestat_configured_project: @@ -297,18 +333,21 @@ def is_project_submittable(self, force=False): return False return True - def add_sample(self, sample, rerun=False): - """ - Add a sample for submission to this conductor. 
- - :param peppy.Sample sample: sample to be included with this conductor's - currently growing collection of command submissions - :param bool rerun: whether the given sample is being rerun rather than - run for the first time - :return bool: Indication of whether the given sample was added to - the current 'pool.' - :raise TypeError: If sample subtype is provided but does not extend - the base Sample class, raise a TypeError. + def add_sample(self, sample, rerun: bool = False) -> list: + """Add a sample for submission to this conductor. + + Args: + sample (peppy.Sample): Sample to be included with this conductor's + currently growing collection of command submissions. + rerun (bool): Whether the given sample is being rerun rather than + run for the first time. + + Returns: + list: List of skip reasons if sample was not added. + + Raises: + TypeError: If sample subtype is provided but does not extend + the base Sample class. """ _LOGGER.debug( "Adding {} to conductor for {} to {}run".format( @@ -331,7 +370,7 @@ def add_sample(self, sample, rerun=False): use_this_sample = True # default to running this sample msg = None if rerun and sample_statuses == []: - msg = f"> Skipping sample because rerun requested, but no failed or waiting flag found." + msg = "> Skipping sample because rerun requested, but no failed or waiting flag found." use_this_sample = False if sample_statuses: status_str = ", ".join(sample_statuses) @@ -389,23 +428,26 @@ def add_sample(self, sample, rerun=False): return skip_reasons - def submit(self, force=False): - """ - Submit one or more commands as a job. + def submit(self, force: bool = False) -> bool: + """Submit one or more commands as a job. This call will submit the commands corresponding to the current pool of samples if and only if the argument to 'force' evaluates to a true value, or the pool of samples is full. - :param bool force: Whether submission should be done/simulated even - if this conductor's pool isn't full. 
- :return bool: Whether a job was submitted (or would've been if - not for dry run) + Args: + force (bool): Whether submission should be done/simulated even + if this conductor's pool isn't full. + + Returns: + bool: Whether a job was submitted (or would've been if + not for dry run). """ submitted = False # Override signal handler so that Ctrl+C can be used to gracefully terminate child process - signal.signal(signal.SIGINT, self._signal_int_handler) + if threading.current_thread() is threading.main_thread(): + signal.signal(signal.SIGINT, self._signal_int_handler) if not self._pool: _LOGGER.debug("No submission (no pooled samples): %s", self.pl_name) @@ -432,10 +474,29 @@ def submit(self, force=False): _LOGGER.info("Dry run, not submitted") elif self._rendered_ok: sub_cmd = self.prj.dcc.compute["submission_command"] - submission_command = "{} {}".format(sub_cmd, script) + + # Detect shell metacharacters that require shell=True + shell_chars = set("|&;<>()$`\\\"' \t\n*?[#~") + needs_shell = any(c in sub_cmd for c in shell_chars) and sub_cmd != "." 
+ # Capture submission command return value so that we can # intercept and report basic submission failures; #167 - process = subprocess.Popen(submission_command, shell=True) + if sub_cmd == ".": + # Direct execution: run script through bash without a submission wrapper + _LOGGER.debug("Direct execution via bash: %s", script) + process = subprocess.Popen(["/bin/bash", script]) + elif needs_shell: + _LOGGER.debug( + "Shell execution (detected shell syntax): %s %s", + sub_cmd, + script, + ) + process = subprocess.Popen( + f"{sub_cmd} {script}", shell=True, executable="/bin/bash" + ) + else: + _LOGGER.debug("Direct execution: %s %s", sub_cmd, script) + process = subprocess.Popen(shlex.split(sub_cmd) + [script]) self.process_id = process.pid process.wait() if process.returncode != 0: @@ -462,30 +523,34 @@ def submit(self, force=False): return submitted - def _is_full(self, pool, size): - """ - Determine whether it's time to submit a job for the pool of commands. + def _is_full(self, pool: list, size: float) -> bool: + """Determine whether it's time to submit a job for the pool of commands. Instances of this class maintain a sort of 'pool' of commands that expands as each new command is added, until a time that it's deemed - 'full' and th + 'full'. + + Args: + pool: Collection of samples/commands. + size: Current total size. - :return bool: Whether this conductor's pool of commands is 'full' and - ready for submission, as determined by its parameterization + Returns: + bool: Whether this conductor's pool of commands is 'full' and + ready for submission, as determined by its parameterization. """ return self.max_cmds == len(pool) or size >= self.max_size @property - def _samples(self): - """ - Return a collection of pooled samples. + def _samples(self) -> list: + """Return a collection of pooled samples. 
- :return Iterable[str]: collection of samples currently in the active - pool for this submission conductor + Returns: + Iterable[str]: Collection of samples currently in the active + pool for this submission conductor. """ return [s for s in self._pool] - def _sample_lump_name(self, pool): + def _sample_lump_name(self, pool: list) -> str: """Determine how to refer to the 'sample' for this submission.""" if self.collate: return self.prj.name @@ -505,16 +570,21 @@ def _sample_lump_name(self, pool): # name concordant with 1-based, not 0-based indexing. return "lump{}".format(self._num_total_job_submissions + 1) - def _signal_int_handler(self, signal, frame): - """ - For catching interrupt (Ctrl +C) signals. Fails gracefully. + def _signal_int_handler(self, signal, frame) -> None: + """For catching interrupt (Ctrl +C) signals. Fails gracefully. + + Args: + signal: Signal received. + frame: Current stack frame. """ signal_type = "SIGINT" self._generic_signal_handler(signal_type) - def _generic_signal_handler(self, signal_type): - """ - Function for handling both SIGTERM and SIGINT + def _generic_signal_handler(self, signal_type: str) -> None: + """Function for handling both SIGTERM and SIGINT. + + Args: + signal_type (str): Type of signal received (SIGTERM or SIGINT). """ message = "Received " + signal_type + ". Failing gracefully..." 
_LOGGER.warning(msg=message) @@ -523,7 +593,7 @@ def _generic_signal_handler(self, signal_type): sys.exit(1) - def _terminate_current_subprocess(self): + def _terminate_current_subprocess(self) -> None: """This terminates the current sub process associated with self.process_id""" def pskill(proc_pid, sig=signal.SIGINT): @@ -570,15 +640,18 @@ def pskill(proc_pid, sig=signal.SIGINT): note = "was already terminated" _LOGGER.warning(msg=f"Child process {self.process_id} {note}.") - def _attend_process(self, proc, sleeptime): - """ - Waits on a process for a given time to see if it finishes, returns True - if it's still running after the given time or False as soon as it - returns. + def _attend_process(self, proc, sleeptime: float) -> bool: + """Wait on a process for a given time to see if it finishes. + + Returns True if it's still running after the given time or False as + soon as it returns. - :param psutil.Process proc: Process object opened by psutil.Popen() - :param float sleeptime: Time to wait - :return bool: True if process is still running; otherwise false + Args: + proc (psutil.Process): Process object opened by psutil.Popen(). + sleeptime (float): Time to wait. + + Returns: + bool: True if process is still running; otherwise false. """ try: proc.wait(timeout=int(sleeptime)) @@ -586,19 +659,23 @@ def _attend_process(self, proc, sleeptime): return True return False - def _jobname(self, pool): + def _jobname(self, pool: list) -> str: """Create the name for a job submission.""" return "{}_{}".format(self.pl_iface.pipeline_name, self._sample_lump_name(pool)) - def _build_looper_namespace(self, pool, size): - """ + def _build_looper_namespace(self, pool: list, size: float) -> YAMLConfigManager: + """Compile a mapping of looper/submission related settings. + Compile a mapping of looper/submission related settings for use in the command templates and in submission script creation in divvy (via adapters). 
- :param Iterable[peppy.Sample] pool: collection of sample instances - :param float size: cumulative size of the given pool - :return yacman.YAMLConfigManager: looper/submission related settings + Args: + pool (Iterable[peppy.Sample]): Collection of sample instances. + size (float): Cumulative size of the given pool. + + Returns: + yacman.YAMLConfigManager: Looper/submission related settings. """ settings = YAMLConfigManager() settings["config_file"] = self.prj.config_file @@ -623,7 +700,7 @@ def _build_looper_namespace(self, pool, size): if pl_config_file: if not os.path.isfile(pl_config_file): _LOGGER.error( - "Pipeline config file specified " "but not found: %s", + "Pipeline config file specified but not found: %s", pl_config_file, ) raise IOError(pl_config_file) @@ -633,16 +710,20 @@ def _build_looper_namespace(self, pool, size): return settings def _set_pipestat_namespace( - self, sample_name: Optional[str] = None + self, sample_name: str | None = None ) -> YAMLConfigManager: - """ + """Compile a mapping of pipestat-related settings. + Compile a mapping of pipestat-related settings for use in the command templates. Accessible via: {pipestat.attrname} - :param str sample_name: name of the sample to get the pipestat - namespace for. If not provided the pipestat namespace will - be determined based on the Project - :return yacman.YAMLConfigManager: pipestat namespace + Args: + sample_name (str): Name of the sample to get the pipestat + namespace for. If not provided the pipestat namespace will + be determined based on the Project. + + Returns: + yacman.YAMLConfigManager: Pipestat namespace. """ try: psm = self.pl_iface.psm @@ -667,13 +748,15 @@ def _set_pipestat_namespace( filtered_namespace = {k: v for k, v in full_namespace.items() if v} return YAMLConfigManager(filtered_namespace) - def write_script(self, pool, size): - """ - Create the script for job submission. 
+ def write_script(self, pool: list, size: float) -> str: + """Create the script for job submission. - :param Iterable[peppy.Sample] pool: collection of sample instances - :param float size: cumulative size of the given pool - :return str: Path to the job submission script created. + Args: + pool (Iterable[peppy.Sample]): Collection of sample instances. + size (float): Cumulative size of the given pool. + + Returns: + str: Path to the job submission script created. """ # looper settings determination if self.collate: @@ -727,7 +810,7 @@ def write_script(self, pool, size): pl_iface[VAR_TEMPL_KEY] = self.pl_iface.render_var_templates( namespaces=namespaces ) - _LOGGER.debug(f"namespace pipelines: { pl_iface }") + _LOGGER.debug(f"namespace pipelines: {pl_iface}") namespaces["pipeline"]["var_templates"] = pl_iface[VAR_TEMPL_KEY] or {} @@ -754,7 +837,19 @@ def write_script(self, pool, size): self._num_good_job_submissions += 1 self._num_total_job_submissions += 1 - looper["command"] = "\n".join(commands) + # Render inject_env_vars and prepend export statements to command + inject_env_vars = self.pl_iface.get("inject_env_vars", {}) + env_exports = [] + if inject_env_vars: + rendered_env_vars = render_inject_env_vars(inject_env_vars, namespaces) + for var_name, var_value in rendered_env_vars.items(): + env_exports.append(f"export {var_name}={shlex.quote(var_value)}") + _LOGGER.debug("Injected env vars:\n{}".format("\n".join(env_exports))) + + # Build final command with env exports prepended + all_lines = env_exports + commands + looper["command"] = "\n".join(all_lines) + if self.collate: _LOGGER.debug("samples namespace:\n{}".format(self.prj.samples)) else: @@ -775,38 +870,42 @@ def write_script(self, pool, size): output_path=subm_base + ".sub", extra_vars=[{"looper": looper}] ) - def _reset_pool(self): + def _reset_pool(self) -> None: """Reset the state of the pool of samples""" self._pool = [] self._curr_size = 0 - def _reset_curr_skips(self): + def 
_reset_curr_skips(self) -> None: self._curr_skip_pool = [] self._curr_skip_size = 0 -def _use_sample(flag, skips): +def _use_sample(flag: bool, skips: list) -> bool: return flag and not skips -def _exec_pre_submit(piface, namespaces): - """ - Execute pre submission hooks defined in the pipeline interface +def _exec_pre_submit(piface, namespaces: dict) -> dict: + """Execute pre submission hooks defined in the pipeline interface. - :param PipelineInterface piface: piface, a source of pre_submit hooks to execute - :param dict[dict[]] namespaces: namspaces mapping - :return dict[dict[]]: updated namspaces mapping + Args: + piface (PipelineInterface): Piface, a source of pre_submit hooks to execute. + namespaces (dict[dict]): Namespaces mapping. + + Returns: + dict[dict]: Updated namespaces mapping. """ def _update_namespaces(x, y, cmd=False): - """ + """Update namespaces mapping with new values. + Update namespaces mapping with a dictionary of the same structure, that includes just the values that need to be updated. - :param dict[dict] x: namespaces mapping - :param dict[dict] y: mapping to update namespaces with - :param bool cmd: whether the mapping to update with comes from the - command template, used for messaging + Args: + x (dict[dict]): Namespaces mapping. + y (dict[dict]): Mapping to update namespaces with. + cmd (bool): Whether the mapping to update with comes from the + command template, used for messaging. 
""" if not y: return diff --git a/looper/const.py b/looper/const.py index bfa51309b..5b5de37bf 100644 --- a/looper/const.py +++ b/looper/const.py @@ -3,9 +3,8 @@ import os from enum import Enum -__author__ = "Databio lab" -__email__ = "nathan@code.databio.org" - +# Re-exported from command_models.messages for backwards compatibility +from .command_models.messages import MESSAGE_BY_SUBCOMMAND __all__ = [ "BUTTON_APPEARANCE_BY_FLAG", @@ -107,14 +106,18 @@ } -def _get_apperance_dict(type, templ=APPEARANCE_BY_FLAG): - """ - Based on the type of the HTML element provided construct the appearence - mapping using the template +def _get_apperance_dict(type: str, templ: dict = APPEARANCE_BY_FLAG) -> dict: + """Construct the appearance mapping using the template. + + Based on the type of the HTML element provided construct the appearance + mapping using the template. - :param dict templ: appearance template to populate - :param str type: type of HTML element to populate template with - :return dict: populated appearance template + Args: + type (str): Type of HTML element to populate template with. + templ (dict): Appearance template to populate. + + Returns: + dict: Populated appearance template. 
""" from copy import deepcopy @@ -133,7 +136,6 @@ def _get_apperance_dict(type, templ=APPEARANCE_BY_FLAG): # Compute-related (for divvy) COMPUTE_SETTINGS_VARNAME = ["DIVCFG"] DEFAULT_COMPUTE_RESOURCES_NAME = "default" -OLD_COMPUTE_KEY = "compute" NEW_COMPUTE_KEY = "compute_packages" DEFAULT_CONFIG_FILEPATH = os.path.join( os.path.dirname(__file__), "default_config", "divvy_config.yaml" @@ -256,21 +258,6 @@ def _get_apperance_dict(type, templ=APPEARANCE_BY_FLAG): SAMPLE_SELECTION_FLAG_OPTNAME = "sel-flag" SAMPLE_EXCLUSION_FLAG_OPTNAME = "exc-flag" -MESSAGE_BY_SUBCOMMAND = { - "run": "Run or submit sample jobs.", - "rerun": "Resubmit sample jobs with failed flags.", - "runp": "Run or submit project jobs.", - "table": "Write summary stats table for project samples.", - "report": "Create browsable HTML report of project results.", - "destroy": "Remove output files of the project.", - "check": "Check flag status of current runs.", - "clean": "Run clean scripts of already processed jobs.", - "inspect": "Print information about a project.", - "init": "Initialize looper config file.", - "init-piface": "Initialize generic pipeline interface.", - "link": "Create directory of symlinks for reported results.", -} - # Add project/sample enum diff --git a/looper/divvy.py b/looper/divvy.py index 84e66ed71..913f64517 100644 --- a/looper/divvy.py +++ b/looper/divvy.py @@ -3,31 +3,25 @@ import logging import os import shutil - - from shutil import copytree -from yacman import FutureYAMLConfigManager as YAMLConfigManager -from yacman import write_lock, FILEPATH_KEY, load_yaml, select_config +from yacman import YAMLConfigManager, load_yaml, select_config, write_lock from .const import ( COMPUTE_SETTINGS_VARNAME, DEFAULT_COMPUTE_RESOURCES_NAME, - NEW_COMPUTE_KEY, DEFAULT_CONFIG_FILEPATH, - DEFAULT_CONFIG_SCHEMA, + NEW_COMPUTE_KEY, ) from .utils import write_submit_script - _LOGGER = logging.getLogger(__name__) # This is the divvy.py submodule from divvy class 
ComputingConfiguration(YAMLConfigManager): - """ - Represents computing configuration objects. + """Represents computing configuration objects. The ComputingConfiguration class provides a computing configuration object that is an *in memory* representation of a `divvy` computing configuration @@ -35,23 +29,20 @@ class ComputingConfiguration(YAMLConfigManager): and retrieve computing configuration files, and use these values to populate job submission script templates. - :param str | Iterable[(str, object)] | Mapping[str, object] entries: config - Collection of key-value pairs. - :param str filepath: YAML file specifying computing package data. (the - `DIVCFG` file) + Args: + entries (str | Iterable[(str, object)] | Mapping[str, object]): Config + collection of key-value pairs. + filepath (str): YAML file specifying computing package data (the + `DIVCFG` file). """ def __init__( self, entries=None, wait_max=None, - strict_ro_locks=False, - schema_source=None, - validate_on_write=False, - ): - super().__init__( - entries, wait_max, strict_ro_locks, schema_source, validate_on_write - ) + strict_ro_locks: bool = False, + ) -> None: + super().__init__(entries, wait_max, strict_ro_locks) if "compute_packages" not in self: self["compute_packages"] = {} @@ -61,11 +52,11 @@ def __init__( self.setdefault("adapters", None) self.activate_package(DEFAULT_COMPUTE_RESOURCES_NAME) - def write(self, filename=None): + def write(self, filename: str | None = None) -> None: with write_lock(self) as locked_ym: locked_ym.rebase() locked_ym.write() - filename = filename or getattr(self, FILEPATH_KEY) + filename = filename or self.filepath filedir = os.path.dirname(filename) # For this object, we *also* have to write the template files for pkg_name, pkg in self["compute_packages"].items(): @@ -74,42 +65,42 @@ def write(self, filename=None): shutil.copyfile(pkg.submission_template, destfile) @property - def compute_env_var(self): - """ - Environment variable through which to access compute 
settings. + def compute_env_var(self) -> list[str]: + """Environment variable through which to access compute settings. - :return list[str]: names of candidate environment variables, for which - value may be path to compute settings file; first found is used. + Returns: + list[str]: Names of candidate environment variables, for which + value may be path to compute settings file; first found is used. """ return COMPUTE_SETTINGS_VARNAME @property - def default_config_file(self): - """ - Path to default compute environment settings file. + def default_config_file(self) -> str: + """Path to default compute environment settings file. - :return str: Path to default compute settings file + Returns: + str: Path to default compute settings file. """ return DEFAULT_CONFIG_FILEPATH # Warning: template cannot be a property, because otherwise # it will get treated as a PathExAttMap treats all properties, which # is that it will turn any double-slashes into single slashes. - def template(self): - """ - Get the currently active submission template. + def template(self) -> str: + """Get the currently active submission template. - :return str: submission script content template for current state + Returns: + str: Submission script content template for current state. """ with open(self.compute["submission_template"], "r") as f: return f.read() @property - def templates_folder(self): - """ - Path to folder with default submission templates. + def templates_folder(self) -> str: + """Path to folder with default submission templates. - :return str: path to folder with default submission templates + Returns: + str: Path to folder with default submission templates. """ if self.filepath: return os.path.join(os.path.dirname(self.filepath), "divvy_templates") @@ -118,17 +109,19 @@ def templates_folder(self): os.path.dirname(__file__), "default_config", "divvy_templates" ) - def activate_package(self, package_name): - """ - Activates a compute package. 
+ def activate_package(self, package_name: str) -> bool: + """Activates a compute package. This copies the computing attributes from the configuration file into the `compute` attribute, where the class stores current compute settings. - :param str package_name: name for non-resource compute bundle, - the name of a subsection in an environment configuration file - :return bool: success flag for attempt to establish compute settings + Args: + package_name (str): Name for non-resource compute bundle, + the name of a subsection in an environment configuration file. + + Returns: + bool: Success flag for attempt to establish compute settings. """ # Hope that environment & environment compute are present. @@ -158,7 +151,6 @@ def activate_package(self, package_name): # but now, it makes more sense to do it here so we can piggyback on # the default update() method and not even have to do that. if not os.path.isabs(self.compute["submission_template"]): - try: if self.filepath: self.compute["submission_template"] = os.path.join( @@ -192,70 +184,72 @@ def activate_package(self, package_name): return False - def clean_start(self, package_name): - """ - Clear current active settings and then activate the given package. + def clean_start(self, package_name: str) -> bool: + """Clear current active settings and then activate the given package. - :param str package_name: name of the resource package to activate - :return bool: success flag + Args: + package_name (str): Name of the resource package to activate. + + Returns: + bool: Success flag. """ self.reset_active_settings() return self.activate_package(package_name) def get_active_package(self) -> YAMLConfigManager: - """ - Returns settings for the currently active compute package + """Returns settings for the currently active compute package. - :return YAMLConfigManager: data defining the active compute package + Returns: + YAMLConfigManager: Data defining the active compute package. 
""" return self.compute @property - def compute_packages(self): + def compute_packages(self) -> dict: return self["compute_packages"] - def list_compute_packages(self): - """ - Returns a list of available compute packages. + def list_compute_packages(self) -> set[str]: + """Returns a list of available compute packages. - :return set[str]: names of available compute packages + Returns: + set[str]: Names of available compute packages. """ return set(self["compute_packages"].keys()) - def reset_active_settings(self): - """ - Clear out current compute settings. + def reset_active_settings(self) -> bool: + """Clear out current compute settings. - :return bool: success flag + Returns: + bool: Success flag. """ self.compute = YAMLConfigManager() return True - def update_packages(self, config_file): - """ - Parse data from divvy configuration file. + def update_packages(self, config_file: str) -> bool: + """Parse data from divvy configuration file. Given a divvy configuration file, this function will update (not overwrite) existing compute packages with existing values. It does not affect any currently active settings. - :param str config_file: path to file with new divvy configuration data + Args: + config_file (str): Path to file with new divvy configuration data. """ entries = load_yaml(config_file) self.update(entries) return True def get_adapters(self) -> YAMLConfigManager: - """ - Get current adapters, if defined. + """Get current adapters, if defined. Adapters are sourced from the 'adapters' section in the root of the divvy configuration file and updated with an active compute package-specific set of adapters, if any defined in 'adapters' section under currently active compute package. - :return YAMLConfigManager: current adapters mapping + Returns: + YAMLConfigManager: Current adapters mapping. 
 """ adapters = YAMLConfigManager() if "adapters" in self and self["adapters"] is not None: @@ -266,7 +260,7 @@ def get_adapters(self) -> YAMLConfigManager: _LOGGER.debug("No adapters determined in divvy configuration file.") return adapters - def submit(self, output_path, extra_vars=None): + def submit(self, output_path: str | None, extra_vars: list | None = None) -> None: if not output_path: import tempfile @@ -283,27 +277,32 @@ def submit(self, output_path, extra_vars=None): _LOGGER.info(submission_command) os.system(submission_command) - def write_script(self, output_path, extra_vars=None): - """ - Given currently active settings, populate the active template to write a - submission script. Additionally use the current adapters to adjust - the select of the provided variables - - :param str output_path: Path to file to write as submission script - :param Iterable[Mapping] extra_vars: A list of Dict objects with - key-value pairs with which to populate template fields. These will - override any values in the currently active compute package. - :return str: Path to the submission script file + def write_script(self, output_path: str, extra_vars: list | None = None) -> str: + """Given currently active settings, populate the active template to write a submission script. + + Additionally use the current adapters to adjust the selection of the + provided variables. + + Args: + output_path (str): Path to file to write as submission script. + extra_vars (Iterable[Mapping]): A list of Dict objects with + key-value pairs with which to populate template fields. These will + override any values in the currently active compute package. + + Returns: + str: Path to the submission script file. """ def _get_from_dict(map, attrs): - """ - Get value from a possibly mapping using a list of its attributes + """Get value from a possible mapping using a list of its attributes. + + Args: + map (collections.Mapping): Mapping to retrieve values from. 
+ attrs (Iterable[str]): A list of attributes. - :param collections.Mapping map: mapping to retrieve values from - :param Iterable[str] attrs: a list of attributes - :return: value found in the the requested attribute or - None if one of the keys does not exist + Returns: + Value found in the requested attribute or None if one of the + keys does not exist. """ for a in attrs: try: @@ -353,7 +352,7 @@ def _get_from_dict(map, attrs): return write_submit_script(output_path, self.template(), variables) - def _handle_missing_env_attrs(self, config_file: str, when_missing) -> None: """Default environment settings aren't required; warn, though.""" missing_env_attrs = [ attr @@ -371,17 +370,19 @@ def _handle_missing_env_attrs(self, config_file, when_missing): when_missing(message) -def select_divvy_config(filepath): - """ - Selects the divvy config file path to load. +def select_divvy_config(filepath: str | None) -> str: + """Selects the divvy config file path to load. This uses a priority ordering to first choose a config file path if it's given, but if not, then look in a priority list of environment variables and choose the first available file path to return. If none of these options succeed, the default config path will be returned. - :param str | NoneType filepath: direct file path specification - :return str: path to the config file to read + Args: + filepath (str | NoneType): Direct file path specification. + + Returns: + str: Path to the config file to read. """ divcfg = select_config( config_filepath=filepath, @@ -394,14 +395,14 @@ def select_divvy_config(filepath): return divcfg -def divvy_init(config_path, template_config_path): - """ - Initialize a genome config file. +def divvy_init(config_path: str, template_config_path: str) -> None: + """Initialize a genome config file. 
- :param str config_path: path to divvy configuration file to - create/initialize - :param str template_config_path: path to divvy configuration file to - copy FROM + Args: + config_path (str): Path to divvy configuration file to + create/initialize. + template_config_path (str): Path to divvy configuration file to + copy FROM. """ if not config_path: _LOGGER.error("You must specify a file path to initialize.") diff --git a/looper/exceptions.py b/looper/exceptions.py index 62b9e041e..3e6d52c4a 100644 --- a/looper/exceptions.py +++ b/looper/exceptions.py @@ -3,9 +3,6 @@ from abc import ABCMeta from collections.abc import Iterable -__author__ = "Vince Reuter" -__email__ = "vreuter@virginia.edu" - _all__ = [ "DuplicatePipelineKeyException", "InvalidResourceSpecificationException", @@ -34,35 +31,35 @@ class SampleFailedException(LooperError): class MisconfigurationException(LooperError): """Looper not properly configured""" - def __init__(self, key): + def __init__(self, key: str) -> None: super(MisconfigurationException, self).__init__(key) class RegistryPathException(LooperError): """Duplication of pipeline identifier precludes unique pipeline ref.""" - def __init__(self, msg): + def __init__(self, msg: str) -> None: super(RegistryPathException, self).__init__(msg) class DuplicatePipelineKeyException(LooperError): """Duplication of pipeline identifier precludes unique pipeline ref.""" - def __init__(self, key): + def __init__(self, key: str) -> None: super(DuplicatePipelineKeyException, self).__init__(key) class InvalidResourceSpecificationException(LooperError): """Pipeline interface resources--if present--needs default.""" - def __init__(self, reason): + def __init__(self, reason: str) -> None: super(InvalidResourceSpecificationException, self).__init__(reason) class JobSubmissionException(LooperError): """Error type for when job submission fails.""" - def __init__(self, sub_cmd, script): + def __init__(self, sub_cmd: str, script: str) -> None: self.script = script 
reason = "Error for command {} and script '{}'".format(sub_cmd, self.script) super(JobSubmissionException, self).__init__(reason) @@ -73,8 +70,8 @@ class PipestatConfigurationException(LooperError): def __init__( self, - sub_cmd, - ): + sub_cmd: str, + ) -> None: reason = "Pipestat must be configured for command {}".format(sub_cmd) super(PipestatConfigurationException, self).__init__(reason) @@ -82,18 +79,18 @@ def __init__( class MissingPipelineConfigurationException(LooperError): """A selected pipeline needs configuration data.""" - def __init__(self, pipeline): + def __init__(self, pipeline: str) -> None: super(MissingPipelineConfigurationException, self).__init__(pipeline) class PipelineInterfaceConfigError(LooperError): """Error with PipelineInterface config data during construction.""" - def __init__(self, context): - """ - For exception context, provide message or collection of missing sections. + def __init__(self, context: str | Iterable[str]) -> None: + """For exception context, provide message or collection of missing sections. - :param str | Iterable[str] context: + Args: + context (str | Iterable[str]): Message or collection of missing sections. 
""" if not isinstance(context, str) and isinstance(context, Iterable): context = "Missing section(s): {}".format(", ".join(context)) @@ -103,7 +100,7 @@ def __init__(self, context): class PipelineInterfaceRequirementsError(LooperError): """Invalid specification of pipeline requirements in interface config.""" - def __init__(self, typename_by_requirement): + def __init__(self, typename_by_requirement: dict) -> None: super(PipelineInterfaceRequirementsError, self).__init__( "{} invalid requirements: {}".format( len(typename_by_requirement), typename_by_requirement @@ -115,5 +112,5 @@ def __init__(self, typename_by_requirement): class LooperReportError(LooperError): """Looper reporting errors""" - def __init__(self, reason): + def __init__(self, reason: str) -> None: super(LooperReportError, self).__init__(reason) diff --git a/looper/looper.py b/looper/looper.py index cb3cb3014..f4537d721 100755 --- a/looper/looper.py +++ b/looper/looper.py @@ -5,13 +5,10 @@ import abc import argparse -import csv import glob import logging -import subprocess -import yaml import os -import pandas as _pd +import subprocess # Need specific sequence of actions for colorama imports? 
from colorama import init @@ -19,34 +16,41 @@ from .const import PipelineLevel init() -from shutil import rmtree - # from collections.abc import Mapping from collections import defaultdict +from shutil import rmtree + from colorama import Fore, Style from eido import validate_config, validate_sample from eido.exceptions import EidoValidationError -from jsonschema import ValidationError -from peppy.const import * from peppy.exceptions import RemoteYAMLError +from pipestat.exceptions import PipestatSummarizeError +from pipestat.reports import get_file_for_table from rich.color import Color from rich.console import Console from rich.table import Table from ubiquerg.cli_tools import query_yes_no - from .conductor import SubmissionConductor - -from .exceptions import * -from .const import * +from .const import ( + DEBUG_COMMANDS, + DEBUG_EIDO_VALIDATION, + DEBUG_JOBS, + NOT_SUB_MSG, + SUBMISSION_FAILURE_MESSAGE, +) +from .exceptions import ( + JobSubmissionException, + LooperReportError, + MisconfigurationException, + SampleFailedException, +) from .project import Project from .utils import ( - desired_samples_range_skipped, desired_samples_range_limited, + desired_samples_range_skipped, sample_folder, ) -from pipestat.reports import get_file_for_table -from pipestat.exceptions import PipestatSummarizeError _PKGNAME = "looper" _LOGGER = logging.getLogger(_PKGNAME) @@ -63,11 +67,11 @@ class Executor(object): __metaclass__ = abc.ABCMeta - def __init__(self, prj): - """ - The Project defines the instance; establish an iteration counter. + def __init__(self, prj: Project) -> None: + """The Project defines the instance; establish an iteration counter. - :param Project prj: Project with which to work/operate on + Args: + prj (Project): Project with which to work/operate on. 
""" super(Executor, self).__init__() self.prj = prj @@ -80,11 +84,11 @@ def __call__(self, *args, **kwargs): class Checker(Executor): - def __call__(self, args): - """ - Check Project status, using pipestat. + def __call__(self, args: argparse.Namespace) -> dict: + """Check Project status, using pipestat. - :param argparse.Namespace: arguments provided to the command + Args: + args (argparse.Namespace): Arguments provided to the command. """ # aggregate pipeline status data @@ -92,7 +96,6 @@ def __call__(self, args): psms = {} if getattr(args, "project", None): - for piface in self.prj.project_pipeline_interfaces: if piface.psm.pipeline_type == PipelineLevel.PROJECT.value: if piface.psm.pipeline_name not in psms: @@ -131,7 +134,7 @@ def __call__(self, args): title=table_title, width=len(table_title) + 10, ) - table.add_column(f"Status", justify="center") + table.add_column("Status", justify="center") table.add_column("Jobs count/total jobs", justify="center") for status_id in psms[pipeline_name].status_schema.keys(): status_list = list(pipeline_status.values()) @@ -171,7 +174,7 @@ def __call__(self, args): table = Table( show_header=True, header_style="bold magenta", - title=f"Status codes description", + title="Status codes description", width=len(psms[pipeline_name].status_schema_source) + 20, caption=f"Descriptions source: {psms[pipeline_name].status_schema_source}", ) @@ -190,12 +193,12 @@ def __call__(self, args): class Cleaner(Executor): """Remove all intermediate files (defined by pypiper clean scripts).""" - def __call__(self, args, preview_flag=True): - """ - Execute the file cleaning process. + def __call__(self, args: argparse.Namespace, preview_flag: bool = True) -> int: + """Execute the file cleaning process. - :param argparse.Namespace args: command-line options and arguments - :param bool preview_flag: whether to halt before actually removing files + Args: + args (argparse.Namespace): Command-line options and arguments. 
+ preview_flag (bool): Whether to halt before actually removing files. """ self.counter.show(name=self.prj.name, type="project") for sample in self.prj.samples: @@ -251,12 +254,14 @@ def select_samples(prj: Project, args: argparse.Namespace): class Destroyer(Executor): """Destroyer of files and folders associated with Project's Samples""" - def __call__(self, args, preview_flag=True): - """ - Completely remove all output produced by any pipelines. + def __call__( + self, args: argparse.Namespace, preview_flag: bool = True + ) -> int | None: + """Completely remove all output produced by any pipelines. - :param argparse.Namespace args: command-line options and arguments - :param bool preview_flag: whether to halt before actually removing files + Args: + args (argparse.Namespace): Command-line options and arguments. + preview_flag (bool): Whether to halt before actually removing files. """ use_pipestat = ( @@ -317,22 +322,20 @@ def __call__(self, args, preview_flag=True): class Collator(Executor): """Submitter for project-level pipelines""" - def __init__(self, prj): - """ - Initializes an instance + def __init__(self, prj: Project) -> None: + """Initializes an instance. - :param Project prj: Project with which to work/operate on + Args: + prj (Project): Project with which to work/operate on. """ super(Executor, self).__init__() self.prj = prj - def __call__(self, args, **compute_kwargs): - """ - Matches collators by protocols, creates submission scripts - and submits them + def __call__(self, args: argparse.Namespace, **compute_kwargs) -> dict: + """Matches collators by protocols, creates submission scripts and submits them. - :param argparse.Namespace args: parsed command-line options and - arguments, recognized by looper + Args: + args (argparse.Namespace): Parsed command-line options and arguments, recognized by looper. 
""" jobs = 0 self.debug = {} @@ -378,16 +381,19 @@ def __call__(self, args, **compute_kwargs): class Runner(Executor): """The true submitter of pipelines""" - def __call__(self, args, top_level_args=None, rerun=False, **compute_kwargs): - """ - Do the Sample submission. - - :param argparse.Namespace args: parsed command-line options and - arguments, recognized by looper - :param list remaining_args: command-line options and arguments not - recognized by looper, germane to samples/pipelines - :param bool rerun: whether the given sample is being rerun rather than - run for the first time + def __call__( + self, + args: argparse.Namespace, + top_level_args=None, + rerun: bool = False, + **compute_kwargs, + ) -> dict: + """Do the Sample submission. + + Args: + args (argparse.Namespace): Parsed command-line options and arguments, recognized by looper. + top_level_args (list): Command-line options and arguments not recognized by looper, germane to samples/pipelines. + rerun (bool): Whether the given sample is being rerun rather than run for the first time. 
""" self.debug = {} # initialize empty dict for return values max_cmds = sum(list(map(len, self.prj._samples_by_interface.values()))) @@ -531,8 +537,9 @@ def __call__(self, args, top_level_args=None, rerun=False, **compute_kwargs): failed_sub_samples = samples_by_reason.get(SUBMISSION_FAILURE_MESSAGE) if failed_sub_samples: _LOGGER.info( - "\n{} samples with at least one failed job submission:" - " {}".format(len(failed_sub_samples), ", ".join(failed_sub_samples)) + "\n{} samples with at least one failed job submission: {}".format( + len(failed_sub_samples), ", ".join(failed_sub_samples) + ) ) # If failure keys are only added when there's at least one sample that @@ -560,10 +567,9 @@ def __call__(self, args, top_level_args=None, rerun=False, **compute_kwargs): class Reporter(Executor): """Combine project outputs into a browsable HTML report""" - def __call__(self, args): + def __call__(self, args: argparse.Namespace) -> dict: # initialize the report builder self.debug = {} - p = self.prj project_level = getattr(args, "project", None) portable = args.portable @@ -573,7 +579,6 @@ def __call__(self, args): psms = {} if project_level: - for piface in self.prj.project_pipeline_interfaces: if piface.psm.pipeline_type == PipelineLevel.PROJECT.value: if piface.psm.pipeline_name not in psms: @@ -616,9 +621,7 @@ def __call__(self, args): class Linker(Executor): """Create symlinks for reported results. Requires pipestat to be configured.""" - def __call__(self, args): - # initialize the report builder - p = self.prj + def __call__(self, args: argparse.Namespace) -> None: project_level = getattr(args, "project", None) link_dir = getattr(args, "output_dir", None) @@ -639,12 +642,13 @@ def __call__(self, args): class Tabulator(Executor): - """Project/Sample statistics and table output generator + """Project/Sample statistics and table output generator. 
- :return list[str|any] results: list containing output file paths of stats and objects + Returns: + list[str|any]: List containing output file paths of stats and objects. """ - def __call__(self, args): + def __call__(self, args: argparse.Namespace) -> list: # p = self.prj project_level = getattr(args, "project", None) report_dir = getattr(args, "report_dir", None) @@ -668,19 +672,17 @@ def __call__(self, args): return results -def _create_failure_message(reason, samples): +def _create_failure_message(reason: str, samples: set[str]) -> str: """Explain lack of submission for a single reason, 1 or more samples.""" return f"{Fore.LIGHTRED_EX + reason + Style.RESET_ALL}: {', '.join(samples)}" -def _remove_or_dry_run(paths, dry_run=False): - """ - Remove file or directory or just inform what would be removed in - case of dry run +def _remove_or_dry_run(paths: list | str, dry_run: bool = False) -> None: + """Remove file or directory or just inform what would be removed in case of dry run. - :param list|str paths: list of paths to files/dirs to be removed - :param bool dry_run: logical indicating whether the files should remain - untouched and message printed + Args: + paths (list|str): List of paths to files/dirs to be removed. + dry_run (bool): Logical indicating whether the files should remain untouched and message printed. """ paths = paths if isinstance(paths, list) else [paths] for path in paths: @@ -697,7 +699,9 @@ def _remove_or_dry_run(paths, dry_run=False): _LOGGER.info(path + " does not exist.") -def destroy_summary(prj, dry_run=False, project_level=False): +def destroy_summary( + prj: Project, dry_run: bool = False, project_level: bool = False +) -> None: """ Delete the summary files if not in dry run mode This function is for use with pipestat configured projects. 
@@ -760,29 +764,31 @@ def destroy_summary(prj, dry_run=False, project_level=False): class LooperCounter(object): - """ - Count samples as you loop through them, and create text for the - subcommand logging status messages. + """Count samples as you loop through them, and create text for the subcommand logging status messages. - :param int total: number of jobs to process + Args: + total (int): Number of jobs to process. """ - def __init__(self, total): + def __init__(self, total: int) -> None: self.count = 0 self.total = total - def show(self, name, type="sample", pipeline_name=None): - """ - Display sample counts status for a particular protocol type. + def show( + self, name: str, type: str = "sample", pipeline_name: str | None = None + ) -> str: + """Display sample counts status for a particular protocol type. The counts are running vs. total for the protocol within the Project, and as a side-effect of the call, the running count is incremented. - :param str name: name of the sample - :param str type: the name of the level of entity being displayed, - either project or sample - :param str pipeline_name: name of the pipeline - :return str: message suitable for logging a status update + Args: + name (str): Name of the sample. + type (str): The name of the level of entity being displayed, either project or sample. + pipeline_name (str): Name of the pipeline. + + Returns: + str: Message suitable for logging a status update. 
""" self.count += 1 return _submission_status_text( @@ -794,16 +800,21 @@ def show(self, name, type="sample", pipeline_name=None): color=Fore.CYAN, ) - def reset(self): + def reset(self) -> None: self.count = 0 - def __str__(self): + def __str__(self) -> str: return "LooperCounter of size {}".format(self.total) def _submission_status_text( - curr, total, name, pipeline_name=None, type="sample", color=Fore.CYAN -): + curr: int, + total: int, + name: str, + pipeline_name: str | None = None, + type: str = "sample", + color: str = Fore.CYAN, +) -> str: """Generate submission sample text for run or collate""" txt = color + f"## [{curr} of {total}] {type}: {name}" if pipeline_name: diff --git a/looper/parser_types.py b/looper/parser_types.py index 984049650..723baa1bb 100644 --- a/looper/parser_types.py +++ b/looper/parser_types.py @@ -3,7 +3,13 @@ from yacman import YAMLConfigManager -def html_range(caravel=False, min_val=0, max_val=10, step=1, value=0): +def html_range( + caravel: bool = False, + min_val: int = 0, + max_val: int = 10, + step: int = 1, + value: int = 0, +) -> callable: caravel_data = YAMLConfigManager( { "element_type": "range", @@ -28,19 +34,21 @@ def fun(x=None, caravel_data=caravel_data, caravel=caravel): return fun -def html_checkbox(caravel=False, checked=False): - """ - Create argument for type parameter on argparse.ArgumentParser.add_argument. +def html_checkbox(caravel: bool = False, checked: bool = False) -> callable: + """Create argument for type parameter on argparse.ArgumentParser.add_argument. + + Args: + caravel (bool): Whether this is being used in the caravel context. + checked (bool): Whether to add a particular key-value entry to a + collection used by caravel. - :param bool caravel: whether this is being used in the caravel context - :param bool checked: whether to add a particular key-value entry to a - collection used by caravel - :return callable: argument to the type parameter of an - argparse.ArgumentParser's add_argument method. 
+    Returns:
+        callable: Argument to the type parameter of an
+            argparse.ArgumentParser's add_argument method.
     """
     caravel_data = YAMLConfigManager({"element_type": "checkbox", "element_args": {}})
     if checked:
-        caravel_data.add_entries({"element_args": {"checked": True}})
+        caravel_data.update({"element_args": {"checked": True}})

     def fun(x=None, caravel_data=caravel_data, caravel=caravel):
         return caravel_data if caravel else eval(x)
@@ -48,15 +56,17 @@ def fun(x=None, caravel_data=caravel_data, caravel=caravel):
     return fun


-def html_select(choices, caravel=False):
-    """
-    Create argument for type parameter on argparse.ArgumentParser.add_argument.
+def html_select(choices: list, caravel: bool = False) -> callable:
+    """Create argument for type parameter on argparse.ArgumentParser.add_argument.
+
+    Args:
+        choices (list[object]): Collection of valid argument values provided
+            to a particular CLI option.
+        caravel (bool): Whether this is being used in the caravel context.

-    :param list[object] choices: collection of valid argument provisions via
-    to a particular CLI option
-    :param bool caravel: whether this is being used in the caravel context
-    :return callable: argument to the type parameter of an
-    argparse.ArgumentParser's add_argument method.
+    Returns:
+        callable: Argument to the type parameter of an
+            argparse.ArgumentParser's add_argument method.
""" if not isinstance(choices, list): raise TypeError( diff --git a/looper/pipeline_interface.py b/looper/pipeline_interface.py index f7f0793ea..87b453171 100644 --- a/looper/pipeline_interface.py +++ b/looper/pipeline_interface.py @@ -3,43 +3,52 @@ import os from collections.abc import Mapping from logging import getLogger -from warnings import warn import jsonschema import pandas as pd from eido import read_schema from peppy import utils as peputil from ubiquerg import expandpath, is_url -from yacman import load_yaml, YAMLConfigManager - -from .const import * +from yacman import YAMLConfigManager, load_yaml + +from .const import ( + COMPUTE_KEY, + DYN_VARS_KEY, + FILE_SIZE_COLNAME, + ID_COLNAME, + INPUT_SCHEMA_KEY, + LOOPER_KEY, + OUTPUT_SCHEMA_KEY, + PIFACE_SCHEMA_SRC, + PIPELINE_INTERFACE_PIPELINE_NAME_KEY, + RESOURCES_KEY, + SIZE_DEP_VARS_KEY, + VAR_TEMPL_KEY, +) from .exceptions import ( InvalidResourceSpecificationException, PipelineInterfaceConfigError, ) from .utils import render_nested_var_templates -__author__ = "Michal Stolarczyk" -__email__ = "michal@virginia.edu" - _LOGGER = getLogger(__name__) @peputil.copy class PipelineInterface(YAMLConfigManager): """ - This class parses, holds, and returns information for a yaml file that - specifies how to interact with each individual pipeline. This - includes both resources to request for cluster job submission, as well as - arguments to be passed from the sample annotation metadata to the pipeline - - :param str | Mapping config: path to file from which to parse - configuration data, or pre-parsed configuration data. - :param str pipeline_type: type of the pipeline, - must be either 'sample' or 'project'. + This class parses, holds, and returns information for a yaml file that specifies how to interact with each individual pipeline. + + This includes both resources to request for cluster job submission, as well as + arguments to be passed from the sample annotation metadata to the pipeline. 
+ + Args: + config (str | Mapping): Path to file from which to parse configuration data, + or pre-parsed configuration data. + pipeline_type (str): Type of the pipeline, must be either 'sample' or 'project'. """ - def __init__(self, config, pipeline_type=None): + def __init__(self, config: str | Mapping, pipeline_type: str | None = None) -> None: super(PipelineInterface, self).__init__() if isinstance(config, Mapping): @@ -57,16 +66,59 @@ def __init__(self, config, pipeline_type=None): self.update(config) self._validate(schema_src=PIFACE_SCHEMA_SRC) self._expand_paths(["compute", "dynamic_variables_script_path"]) + self._validate_pipestat_handoff() @property - def pipeline_name(self): + def pipeline_name(self) -> str: return self[PIPELINE_INTERFACE_PIPELINE_NAME_KEY] - def render_var_templates(self, namespaces): + def _validate_pipestat_handoff(self) -> None: + """Validate that pipestat-enabled interfaces pass config to pipeline. + + Raises: + PipelineInterfaceConfigError: If output_schema present but no handoff mechanism. + """ + if OUTPUT_SCHEMA_KEY not in self: + return # Not pipestat-enabled, nothing to validate + + if self.get("pipestat_config_required") is False: + return # Explicitly disabled + + # Check for CLI handoff: {pipestat.config_file} or {pipestat.*} in command_template + cmd_template = self.get("command_template", "") + # Also check sample_interface and project_interface sections + sample_iface = self.get("sample_interface", {}) + project_iface = self.get("project_interface", {}) + sample_cmd = sample_iface.get("command_template", "") if sample_iface else "" + project_cmd = project_iface.get("command_template", "") if project_iface else "" + + has_cli_handoff = ( + "{pipestat." in cmd_template + or "{pipestat." in sample_cmd + or "{pipestat." 
in project_cmd + ) + + # Check for env var handoff: PIPESTAT_CONFIG in inject_env_vars + inject_env_vars = self.get("inject_env_vars", {}) + has_env_handoff = "PIPESTAT_CONFIG" in inject_env_vars + + if not has_cli_handoff and not has_env_handoff: + raise PipelineInterfaceConfigError( + f"Pipeline '{self.pipeline_name}' has output_schema but no pipestat config handoff.\n\n" + f"Add one of:\n" + f" 1. In command_template: --pipestat-config {{pipestat.config_file}}\n" + f" 2. In inject_env_vars:\n" + f" inject_env_vars:\n" + f' PIPESTAT_CONFIG: "{{pipestat.config_file}}"\n\n' + f"Or set 'pipestat_config_required: false' to disable this check." + ) + + def render_var_templates(self, namespaces: dict) -> dict: """ Render path templates under 'var_templates' in this pipeline interface. - :param dict namespaces: namespaces to use for rendering + Args: + namespaces (dict): Namespaces to use for rendering. """ try: curr_data = self[VAR_TEMPL_KEY] @@ -83,12 +135,15 @@ def render_var_templates(self, namespaces): var_templates = render_nested_var_templates(var_templates, namespaces) return var_templates - def get_pipeline_schemas(self, schema_key=INPUT_SCHEMA_KEY): + def get_pipeline_schemas(self, schema_key: str = INPUT_SCHEMA_KEY) -> str | None: """ Get path to the pipeline schema. - :param str schema_key: where to look for schemas in the pipeline iface - :return str: absolute path to the pipeline schema file + Args: + schema_key (str): Where to look for schemas in the pipeline iface. + + Returns: + str: Absolute path to the pipeline schema file. """ schema_source = None if schema_key in self: @@ -103,19 +158,23 @@ def get_pipeline_schemas(self, schema_key=INPUT_SCHEMA_KEY): ) return schema_source - def choose_resource_package(self, namespaces, file_size): + def choose_resource_package(self, namespaces: dict, file_size: float) -> dict: """ Select resource bundle for given input file size to given pipeline. - :param float file_size: Size of input data (in gigabytes). 
- :param Mapping[Mapping[str]] namespaces: namespaced variables to pass - as a context for fluid attributes command rendering - :return MutableMapping: resource bundle appropriate for given pipeline, - for given input file size - :raises ValueError: if indicated file size is negative, or if the - file size value specified for any resource package is negative - :raises InvalidResourceSpecificationException: if no default - resource package specification is provided + Args: + file_size (float): Size of input data (in gigabytes). + namespaces (Mapping[Mapping[str]]): Namespaced variables to pass as a context + for fluid attributes command rendering. + + Returns: + MutableMapping: Resource bundle appropriate for given pipeline, for given input file size. + + Raises: + ValueError: If indicated file size is negative, or if the file size value + specified for any resource package is negative. + InvalidResourceSpecificationException: If no default resource package + specification is provided. """ def _file_size_ante(name, data): @@ -132,7 +191,8 @@ def _file_size_ante(name, data): if fsize < 0: raise InvalidResourceSpecificationException( "Found negative value () in '{}' column; package '{}'".format( - fsize, FILE_SIZE_COLNAME, name + fsize, + FILE_SIZE_COLNAME, ) ) return fsize @@ -145,12 +205,13 @@ def _notify(msg): def _load_dynamic_vars(pipeline): """ - Render command string (jinja2 template), execute it in a subprocess - and return its result (JSON object) as a dict + Render command string (jinja2 template), execute it in a subprocess and return its result (JSON object) as a dict. - :param Mapping pipeline: pipeline dict - :return Mapping: a dict with attributes returned in the JSON - by called command + Args: + pipeline (Mapping): Pipeline dict. + + Returns: + Mapping: A dict with attributes returned in the JSON by called command. 
""" def _log_raise_latest(): @@ -197,11 +258,14 @@ def _log_raise_latest(): def _load_size_dep_vars(piface): """ - Read the resources from a TSV provided in the pipeline interface + Read the resources from a TSV provided in the pipeline interface. + + Args: + piface (looper.PipelineInterface): Currently processed piface. + section (str): Section of pipeline interface to process. - :param looper.PipelineInterface piface: currently processed piface - :param str section: section of pipeline interface to process - :return pandas.DataFrame: resources + Returns: + pandas.DataFrame: Resources. """ df = None if COMPUTE_KEY in piface and SIZE_DEP_VARS_KEY in piface[COMPUTE_KEY]: @@ -227,8 +291,9 @@ def _load_size_dep_vars(piface): # Ensure that we have a numeric value before attempting comparison. file_size = float(file_size) assert file_size >= 0, ValueError( - "Attempted selection of resource " - "package for negative file size: {}".format(file_size) + "Attempted selection of resource package for negative file size: {}".format( + file_size + ) ) fluid_resources = _load_dynamic_vars(self) @@ -277,22 +342,25 @@ def _load_size_dep_vars(piface): resources_data.update(project[LOOPER_KEY][COMPUTE_KEY][RESOURCES_KEY]) return resources_data - def _expand_paths(self, keys): + def _expand_paths(self, keys: list[str]) -> None: """ - Expand paths defined in the pipeline interface file + Expand paths defined in the pipeline interface file. - :param list keys: list of keys resembling the nested structure to get - to the pipeline interface attributre to expand + Args: + keys (list): List of keys resembling the nested structure to get to the + pipeline interface attribute to expand. """ def _get_from_dict(map, attrs): """ - Get value from a possibly nested mapping using a list of its attributes + Get value from a possibly nested mapping using a list of its attributes. 
- :param collections.Mapping map: mapping to retrieve values from - :param Iterable[str] attrs: a list of attributes - :return: value found in the the requested attribute or - None if one of the keys does not exist + Args: + map (collections.Mapping): Mapping to retrieve values from. + attrs (Iterable[str]): A list of attributes. + + Returns: + Value found in the requested attribute or None if one of the keys does not exist. """ for a in attrs: try: @@ -303,19 +371,21 @@ def _get_from_dict(map, attrs): def _set_in_dict(map, attrs, val): """ - Set value in a mapping, creating a possibly nested structure + Set value in a mapping, creating a possibly nested structure. + + Args: + map (collections.Mapping): Mapping to retrieve values from. + attrs (Iterable[str]): A list of attributes. + val: Value to set. - :param collections.Mapping map: mapping to retrieve values from - :param Iterable[str] attrs: a list of attributes - :param val: value to set - :return: value found in the the requested attribute or - None if one of the keys does not exist + Returns: + Value found in the requested attribute or None if one of the keys does not exist. """ for a in attrs: if a == attrs[-1]: map[a] = val break - map.setdefault(a, PXAM()) + map.setdefault(a, {}) map = map[a] raw_path = _get_from_dict(self, keys) @@ -341,14 +411,17 @@ def _set_in_dict(map, attrs, val): _LOGGER.debug("Expanded path: {}".format(pipe_path)) _set_in_dict(self, keys, pipe_path) - def _validate(self, schema_src, exclude_case=False, flavor="generic"): + def _validate( + self, schema_src: str, exclude_case: bool = False, flavor: str = "generic" + ) -> None: """ - Generic function to validate the object against a schema + Generic function to validate the object against a schema. - :param str schema_src: schema source to validate against, URL or path - :param bool exclude_case: whether to exclude validated objects - from the error. 
Useful when used ith large projects - :param str flavor: type of the pipeline schema to use + Args: + schema_src (str): Schema source to validate against, URL or path. + exclude_case (bool): Whether to exclude validated objects from the error. + Useful when used with large projects. + flavor (str): Type of the pipeline schema to use. """ schema_source = schema_src.format(flavor) for schema in read_schema(schema_source): diff --git a/looper/plugins.py b/looper/plugins.py index dc34283e0..3d628038e 100644 --- a/looper/plugins.py +++ b/looper/plugins.py @@ -1,22 +1,29 @@ import logging import os -from .const import * + from .conductor import _get_yaml_path +from .const import ( + SAMPLE_CWL_YAML_PATH_KEY, + SAMPLE_YAML_PATH_KEY, + SAMPLE_YAML_PRJ_PATH_KEY, +) _LOGGER = logging.getLogger(__name__) -def write_sample_yaml_prj(namespaces): - """ - Plugin: saves sample representation with project reference to YAML. +def write_sample_yaml_prj(namespaces: dict) -> dict: + """Plugin: saves sample representation with project reference to YAML. This plugin can be parametrized by providing the path value/template in 'pipeline.var_templates.sample_yaml_prj_path'. This needs to be a complete and absolute path to the file where sample YAML representation is to be stored. - :param dict namespaces: variable namespaces dict - :return dict: sample namespace dict + Args: + namespaces (dict): Variable namespaces dict. + + Returns: + dict: Sample namespace dict. 
""" sample = namespaces["sample"] sample.to_yaml( @@ -26,7 +33,7 @@ def write_sample_yaml_prj(namespaces): return {"sample": sample} -def write_custom_template(namespaces): +def write_custom_template(namespaces: dict) -> dict | None: """ Plugin: Populates a user-provided jinja template @@ -63,9 +70,8 @@ def load_template(pipeline): return {"sample": namespaces["sample"]} -def write_sample_yaml_cwl(namespaces): - """ - Plugin: Produce a cwl-compatible yaml representation of the sample +def write_sample_yaml_cwl(namespaces: dict) -> dict: + """Plugin: Produce a cwl-compatible yaml representation of the sample. Also adds the 'cwl_yaml' attribute to sample objects, which points to the file produced. @@ -75,8 +81,11 @@ def write_sample_yaml_cwl(namespaces): absolute path to the file where sample YAML representation is to be stored. - :param dict namespaces: variable namespaces dict - :return dict: updated variable namespaces dict + Args: + namespaces (dict): Variable namespaces dict. + + Returns: + dict: Updated variable namespaces dict. """ from eido import read_schema from ubiquerg import is_url @@ -132,25 +141,26 @@ def _get_schema_source( sample[dir_attr] = {"class": "Directory", "location": dir_attr_value} else: _LOGGER.warning( - "No 'input_schema' defined, producing a regular " - "sample YAML representation" + "No 'input_schema' defined, producing a regular sample YAML representation" ) _LOGGER.info("Writing sample yaml to {}".format(sample.sample_yaml_cwl)) sample.to_yaml(sample.sample_yaml_cwl) return {"sample": sample} -def write_sample_yaml(namespaces): - """ - Plugin: saves sample representation to YAML. +def write_sample_yaml(namespaces: dict) -> dict: + """Plugin: saves sample representation to YAML. This plugin can be parametrized by providing the path value/template in 'pipeline.var_templates.sample_yaml_path'. This needs to be a complete and absolute path to the file where sample YAML representation is to be stored. 
- :param dict namespaces: variable namespaces dict - :return dict: sample namespace dict + Args: + namespaces (dict): Variable namespaces dict. + + Returns: + dict: Sample namespace dict. """ sample = namespaces["sample"] sample["sample_yaml_path"] = _get_yaml_path( diff --git a/looper/processed_project.py b/looper/processed_project.py index 39b87fa0d..b863f10ed 100644 --- a/looper/processed_project.py +++ b/looper/processed_project.py @@ -4,9 +4,6 @@ but the report generation approach has changed. """ -__author__ = "Michal Stolarczyk" -__email__ = "michal@virginia.edu" - # import os # from collections.abc import Mapping # from copy import copy @@ -128,8 +125,8 @@ import os from logging import getLogger -from eido.const import * -from eido.exceptions import * +from eido.const import PROP_KEY +from eido.exceptions import EidoSchemaInvalidError from peppy.project import Project from peppy.sample import Sample @@ -139,27 +136,32 @@ PATH_LIKE = [PATH_KEY, THUMB_PATH_KEY] -def _get_path_sect_keys(mapping, keys=[PATH_KEY]): - """ - Get names of subsections in a mapping that contain collection of keys +def _get_path_sect_keys(mapping: dict, keys: list[str] = [PATH_KEY]) -> list[str]: + """Get names of subsections in a mapping that contain collection of keys. + + Args: + mapping (Mapping): Schema subsection to search for paths. + keys (Iterable[str]): Collection of keys to check for. - :param Mapping mapping: schema subsection to search for paths - :param Iterable[str] keys: collection of keys to check for - :return Iterable[str]: collection of keys to path-like sections + Returns: + Iterable[str]: Collection of keys to path-like sections. """ return [k for k, v in mapping.items() if bool(set(keys) & set(mapping[k]))] -def _populate_paths(object, schema, check_exist): - """ - Populate path-like object attributes with other object attributes - based on a defined template, e.g. 
'/Users/x/test_{name}/{genome}_file.txt'
-
-    :param Mapping object: object with attributes to populate path template with
-    :param dict schema: schema with path attributes defined, e.g.
-    output of read_schema function
-    :param bool check_exist: whether the paths should be check for existence
-    :return Mapping: object with path templates populated
+def _populate_paths(object, schema: dict, check_exist: bool) -> None:
+    """Populate path-like object attributes with other object attributes.
+
+    Based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt'
+
+    Args:
+        object (Mapping): Object with attributes to populate path template with.
+        schema (dict): Schema with path attributes defined, e.g.
+            output of read_schema function.
+        check_exist (bool): Whether the paths should be checked for existence.
+
+    Note:
+        Mutates the passed object in place and returns None.
     """
     if PROP_KEY not in schema:
         raise EidoSchemaInvalidError("Schema is missing properties section.")
@@ -172,8 +174,7 @@ def _populate_paths(object, schema, check_exist):
             populated = templ.format(**dict(object.items()))
         except Exception as e:
             _LOGGER.warning(
-                "Caught exception: {}.\n"
-                "Could not populate path: {}".format(
+                "Caught exception: {}.\nCould not populate path: {}".format(
                     getattr(e, "message", repr(e)), templ
                 )
             )
@@ -188,16 +189,19 @@ def _populate_paths(object, schema, check_exist):
-def populate_sample_paths(sample, schema, check_exist=False):
-    """
-    Populate path-like Sample attributes with other object attributes
-    based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt'
-
-    :param peppy.Sample sample: sample to populate paths in
-    :param Iterable[dict] schema: schema with path attributes defined, e.g.
-    output of read_schema function
-    :param bool check_exist: whether the paths should be check for existence
-    :return Mapping: Sample with path templates populated
+def populate_sample_paths(sample, schema: dict, check_exist: bool = False) -> None:
+    """Populate path-like Sample attributes with other object attributes.
+
+    Based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt'
+
+    Args:
+        sample (peppy.Sample): Sample to populate paths in.
+        schema (Iterable[dict]): Schema with path attributes defined, e.g.
+            output of read_schema function.
+        check_exist (bool): Whether the paths should be checked for existence.
+
+    Note:
+        Modifies the Sample in place and returns None.
     """
     if not isinstance(sample, Sample):
         raise TypeError("Can only populate paths in peppy.Sample objects")
@@ -206,30 +210,34 @@ def populate_sample_paths(sample, schema, check_exist=False):
     _populate_paths(sample, schema, check_exist)


-def populate_project_paths(project, schema, check_exist=False):
-    """
-    Populate path-like Project attributes with other object attributes
-    based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt'
-
-    :param peppy.Project project: project to populate paths in
-    :param dict schema: schema with path attributes defined, e.g.
-    output of read_schema function
-    :param bool check_exist: whether the paths should be check for existence
-    :return Mapping: Project with path templates populated
+def populate_project_paths(project, schema: dict, check_exist: bool = False) -> None:
+    """Populate path-like Project attributes with other object attributes.
+
+    Based on a defined template, e.g. '/Users/x/test_{name}/{genome}_file.txt'
+
+    Args:
+        project (peppy.Project): Project to populate paths in.
+        schema (dict): Schema with path attributes defined, e.g.
+            output of read_schema function.
+        check_exist (bool): Whether the paths should be checked for existence.
+
+    Note:
+        Modifies the Project in place and returns None.
""" if not isinstance(project, Project): raise TypeError("Can only populate paths in peppy.Project objects") _populate_paths(project, schema, check_exist) -def get_project_outputs(project, schema): - """ - Get project level outputs with path-like attributes populated with - project attributes +def get_project_outputs(project, schema: list[dict]): + """Get project level outputs with path-like attributes populated with project attributes. + + Args: + project (peppy.Project): Project to get outputs for. + schema (Iterable[dict]): Schema to source the outputs from. - :param peppy.Project project: - :param Iterable[dict] schema: - :return yacman.YAMLConfigManager: mapping with populated path-like attributes + Returns: + yacman.YAMLConfigManager: Mapping with populated path-like attributes. """ from yacman import YAMLConfigManager @@ -250,7 +258,8 @@ def get_project_outputs(project, schema): res[ps][p] = s[ps][p].format(**dict(project.items())) except Exception as e: _LOGGER.debug( - "Caught exception: {}.\n Could not populate {} " - "path".format(p, str(e)) + "Caught exception: {}.\n Could not populate {} path".format( + p, str(e) + ) ) return YAMLConfigManager(res) diff --git a/looper/project.py b/looper/project.py index 88de52e00..7b58a4e54 100644 --- a/looper/project.py +++ b/looper/project.py @@ -1,7 +1,7 @@ """Looper version of NGS project model.""" -import itertools import os +from typing import NoReturn from yaml import safe_load @@ -11,20 +11,45 @@ # cached_property was introduced in python 3.8 cached_property = property -from .divvy import ComputingConfiguration -from eido import PathAttrNotFoundError, read_schema +from eido import read_schema from jsonschema import ValidationError from pandas.core.common import flatten +from peppy import Project as peppyProject +from peppy.const import CONFIG_KEY from peppy.utils import make_abs_via_cfg from pipestat import PipestatManager from .conductor import write_pipestat_config - -from .exceptions import * +from .const 
import ( + CLI_PROJ_ATTRS, + COMPUTE_PACKAGE_KEY, + DRY_RUN_KEY, + EXTRA_KEY, + FILE_CHECKS_KEY, + INPUT_SCHEMA_KEY, + LOOPER_KEY, + OUTDIR_KEY, + OUTPUT_SCHEMA_KEY, + PIFACE_KEY_SELECTOR, + PIPELINE_INTERFACE_PIPELINE_NAME_KEY, + PIPELINE_INTERFACES_KEY, + PIPESTAT_KEY, + RESULTS_SUBDIR_KEY, + SAMPLE_PL_ARG, + SUBMISSION_SUBDIR_KEY, + PipelineLevel, +) +from .divvy import ComputingConfiguration +from .exceptions import MisconfigurationException, PipelineInterfaceConfigError from .pipeline_interface import PipelineInterface from .processed_project import populate_project_paths, populate_sample_paths -from .utils import * -from .const import PipelineLevel +from .utils import ( + expandpath, + fetch_sample_flags, + get_sample_status, + getLogger, + is_pephub_registry_path, +) __all__ = ["Project"] @@ -37,17 +62,18 @@ class ProjectContext(object): def __init__( self, prj, - selector_attribute=None, - selector_include=None, - selector_exclude=None, - selector_flag=None, - exclusion_flag=None, - ): + selector_attribute: str | None = None, + selector_include: list | str | None = None, + selector_exclude: list | str | None = None, + selector_flag: list | str | None = None, + exclusion_flag: list | str | None = None, + ) -> None: """Project and what to include/exclude defines the context.""" if not isinstance(selector_attribute, str): raise TypeError( - "Name of attribute for sample selection isn't a string: {} " - "({})".format(selector_attribute, type(selector_attribute)) + "Name of attribute for sample selection isn't a string: {} ({})".format( + selector_attribute, type(selector_attribute) + ) ) self.prj = prj self.include = selector_include @@ -96,21 +122,27 @@ def __exit__(self, *args): class Project(peppyProject): - """ - Looper-specific Project. 
-
-    :param str cfg: path to configuration file with data from
-        which Project is to be built
-    :param Iterable[str] amendments: name indicating amendment to use, optional
-    :param str divcfg_path: path to an environment configuration YAML file
-        specifying compute settings.
-    :param bool permissive: Whether a error should be thrown if
-        a sample input file(s) do not exist or cannot be open.
-    :param str compute_env_file: Environment configuration YAML file specifying
-        compute settings.
+    """Looper-specific Project.
+
+    Args:
+        cfg (str): Path to configuration file with data from which Project is
+            to be built.
+        amendments (Iterable[str]): Name indicating amendment to use, optional.
+        divcfg_path (str): Path to an environment configuration YAML file
+            specifying compute settings.
+        permissive (bool): Whether an error should be thrown if a sample input
+            file(s) do not exist or cannot be opened.
+        compute_env_file (str): Environment configuration YAML file specifying
+            compute settings.
     """

-    def __init__(self, cfg=None, amendments=None, divcfg_path=None, **kwargs):
+    def __init__(
+        self,
+        cfg: str | None = None,
+        amendments=None,
+        divcfg_path: str | None = None,
+        **kwargs,
+    ) -> None:
         super(Project, self).__init__(cfg=cfg, amendments=amendments)
         prj_dict = kwargs.get("project_dict")
         pep_config = kwargs.get("pep_config", None)
@@ -127,7 +159,7 @@ def __init__(self, cfg=None, amendments=None, divcfg_path=None, **kwargs):

         try:
             # For loading PEPs via CSV, Peppy cannot infer project name.
-            name = self.name
+            self.name  # noqa: B018
         except NotImplementedError:
             self.name = None

@@ -156,29 +188,29 @@ def __init__(self, cfg=None, amendments=None, divcfg_path=None, **kwargs):
         self.make_project_dirs()

     @property
-    def piface_key(self):
-        """
-        Name of the pipeline interface attribute for this project
+    def piface_key(self) -> str:
+        """Name of the pipeline interface attribute for this project.
- :return str: name of the pipeline interface attribute + Returns: + str: Name of the pipeline interface attribute. """ return self._extra_cli_or_cfg(PIFACE_KEY_SELECTOR) or PIPELINE_INTERFACES_KEY @property - def selected_compute_package(self): - """ - Compute package name specified in object constructor + def selected_compute_package(self) -> str | None: + """Compute package name specified in object constructor. - :return str: compute package name + Returns: + str: Compute package name. """ return self._extra_cli_or_cfg(COMPUTE_PACKAGE_KEY) @property - def cli_pifaces(self): - """ - Collection of pipeline interface sources specified in object constructor + def cli_pifaces(self) -> list[str] | None: + """Collection of pipeline interface sources specified in object constructor. - :return list[str]: collection of pipeline interface sources + Returns: + list[str]: Collection of pipeline interface sources. """ x = self._extra_cli_or_cfg(self.piface_key) return ( @@ -188,23 +220,23 @@ def cli_pifaces(self): ) @property - def output_dir(self): - """ - Output directory for the project, specified in object constructor + def output_dir(self) -> str: + """Output directory for the project, specified in object constructor. - :return str: path to the output directory + Returns: + str: Path to the output directory. """ return self._extra_cli_or_cfg(OUTDIR_KEY, strict=True) - def _extra_cli_or_cfg(self, attr_name, strict=False): - """ - Get attribute value provided in kwargs in object constructor of from - looper section in the configuration file + def _extra_cli_or_cfg(self, attr_name: str, strict: bool = False): + """Get attribute value provided in kwargs in object constructor or from looper section in the configuration file. + + Args: + attr_name (str): Name of the attribute to get value for. + strict (bool): Whether a non-existent attribute is exceptional. 
- :param str attr_name: name of the attribute to get value for - :param bool strict: whether a non-existent attribute is exceptional - :raise MisconfigurationException: in strict mode, when no attribute - found + Raises: + MisconfigurationException: In strict mode, when no attribute found. """ try: result = self[EXTRA_KEY][attr_name] @@ -229,38 +261,41 @@ def _extra_cli_or_cfg(self, attr_name, strict=False): return @property - def results_folder(self): - """ - Path to the results folder for the project + def results_folder(self) -> str: + """Path to the results folder for the project. - :return str: path to the results folder in the output folder + Returns: + str: Path to the results folder in the output folder. """ return self._out_subdir_path(RESULTS_SUBDIR_KEY, default="results_pipeline") @property - def submission_folder(self): - """ - Path to the submission folder for the project + def submission_folder(self) -> str: + """Path to the submission folder for the project. - :return str: path to the submission in the output folder + Returns: + str: Path to the submission in the output folder. """ return self._out_subdir_path(SUBMISSION_SUBDIR_KEY, default="submission") def _out_subdir_path(self, key: str, default: str) -> str: - """ - Create a system path relative to the project output directory. + """Create a system path relative to the project output directory. + The values for the names of the subdirectories are sourced from kwargs passed to the object constructor. - :param str key: name of the attribute mapped to the value of interest - :param str default: if key not specified, a default to use - :return str: path to the folder + Args: + key (str): Name of the attribute mapped to the value of interest. + default (str): If key not specified, a default to use. + + Returns: + str: Path to the folder. 
""" parent = getattr(self, OUTDIR_KEY) child = getattr(self[EXTRA_KEY], key, default) or default return os.path.join(parent, child) - def make_project_dirs(self): + def make_project_dirs(self) -> None: """ Create project directory structure if it doesn't exist. """ @@ -279,12 +314,13 @@ def make_project_dirs(self): ) @cached_property - def project_pipeline_interface_sources(self): - """ - Get a list of all valid project-level pipeline interface sources - associated with this project. Sources that are file paths are expanded + def project_pipeline_interface_sources(self) -> list[str]: + """Get a list of all valid project-level pipeline interface sources associated with this project. + + Sources that are file paths are expanded. - :return list[str]: collection of valid pipeline interface sources: + Returns: + list[str]: Collection of valid pipeline interface sources. """ return ( [self._resolve_path_with_cfg(src) for src in self.cli_pifaces] @@ -293,16 +329,15 @@ def project_pipeline_interface_sources(self): ) @cached_property - def project_pipeline_interfaces(self): - """ - Flat list of all valid project-level interface objects associated - with this Project + def project_pipeline_interfaces(self) -> list: + """Flat list of all valid project-level interface objects associated with this Project. Note that only valid pipeline interfaces will show up in the result (ones that exist on disk/remotely and validate successfully - against the schema) + against the schema). - :return list[looper.PipelineInterface]: list of pipeline interfaces + Returns: + list[looper.PipelineInterface]: List of pipeline interfaces. 
""" return [ PipelineInterface(pi, pipeline_type=PipelineLevel.PROJECT.value) @@ -310,60 +345,63 @@ def project_pipeline_interfaces(self): ] @cached_property - def pipeline_interfaces(self): - """ - Flat list of all valid interface objects associated with this Project + def pipeline_interfaces(self) -> list: + """Flat list of all valid interface objects associated with this Project. Note that only valid pipeline interfaces will show up in the result (ones that exist on disk/remotely and validate successfully - against the schema) + against the schema). - :return list[looper.PipelineInterface]: list of pipeline interfaces + Returns: + list[looper.PipelineInterface]: List of pipeline interfaces. """ return [pi for ifaces in self._interfaces_by_sample.values() for pi in ifaces] @cached_property def pipeline_interface_sources(self): - """ - Get a list of all valid pipeline interface sources associated - with this project. Sources that are file paths are expanded + """Get a list of all valid pipeline interface sources associated with this project. - :return list[str]: collection of valid pipeline interface sources + Sources that are file paths are expanded. + + Returns: + list[str]: Collection of valid pipeline interface sources. """ return self._samples_by_interface.keys() @cached_property - def pipestat_configured(self): - """ - Whether pipestat configuration is complete for all sample pipelines + def pipestat_configured(self) -> bool: + """Whether pipestat configuration is complete for all sample pipelines. - :return bool: whether pipestat configuration is complete + Returns: + bool: Whether pipestat configuration is complete. """ return self._check_if_pipestat_configured() @cached_property - def pipestat_configured_project(self): - """ - Whether pipestat configuration is complete for all project pipelines + def pipestat_configured_project(self) -> bool: + """Whether pipestat configuration is complete for all project pipelines. 
- :return bool: whether pipestat configuration is complete + Returns: + bool: Whether pipestat configuration is complete. """ return self._check_if_pipestat_configured( pipeline_type=PipelineLevel.PROJECT.value ) - def get_sample_piface(self, sample_name): - """ - Get a list of pipeline interfaces associated with the specified sample. + def get_sample_piface(self, sample_name: str) -> list | None: + """Get a list of pipeline interfaces associated with the specified sample. Note that only valid pipeline interfaces will show up in the result (ones that exist on disk/remotely and validate successfully - against the schema) + against the schema). + + Args: + sample_name (str): Name of the sample to retrieve list of pipeline + interfaces for. - :param str sample_name: name of the sample to retrieve list of - pipeline interfaces for - :return list[looper.PipelineInterface]: collection of valid - pipeline interfaces associated with selected sample + Returns: + list[looper.PipelineInterface]: Collection of valid pipeline + interfaces associated with selected sample. """ try: return self._interfaces_by_sample[sample_name] @@ -371,14 +409,15 @@ def get_sample_piface(self, sample_name): return None @staticmethod - def get_schemas(pifaces, schema_key=INPUT_SCHEMA_KEY): - """ - Get the list of unique schema paths for a list of pipeline interfaces + def get_schemas(pifaces, schema_key: str = INPUT_SCHEMA_KEY) -> list[str]: + """Get the list of unique schema paths for a list of pipeline interfaces. + + Args: + pifaces (str | Iterable[str]): Pipeline interfaces to search schemas for. + schema_key (str): Where to look for schemas in the piface. - :param str | Iterable[str] pifaces: pipeline interfaces to search - schemas for - :param str schema_key: where to look for schemas in the piface - :return Iterable[str]: unique list of schema file paths + Returns: + Iterable[str]: Unique list of schema file paths. 
""" if isinstance(pifaces, str): pifaces = [pifaces] @@ -389,7 +428,9 @@ def get_schemas(pifaces, schema_key=INPUT_SCHEMA_KEY): schema_set.update([schema_file]) return list(schema_set) - def _check_if_pipestat_configured(self, pipeline_type=PipelineLevel.SAMPLE.value): + def _check_if_pipestat_configured( + self, pipeline_type: str = PipelineLevel.SAMPLE.value + ) -> bool: # First check if pipestat key is in looper_config, if not return false @@ -403,20 +444,21 @@ def _check_if_pipestat_configured(self, pipeline_type=PipelineLevel.SAMPLE.value # This should return True OR raise an exception at this point. return self._get_pipestat_configuration(pipeline_type) - def _get_pipestat_configuration(self, pipeline_type=PipelineLevel.SAMPLE.value): + def _get_pipestat_configuration( + self, pipeline_type: str = PipelineLevel.SAMPLE.value + ) -> bool: # First check if it already exists if pipeline_type == PipelineLevel.SAMPLE.value: for piface in self.pipeline_interfaces: - pipestat_config_path = self._check_for_existing_pipestat_config(piface) if not pipestat_config_path: self._create_pipestat_config(piface, pipeline_type) else: - piface.psm = PipestatManager( - config_file=pipestat_config_path, + piface.psm = PipestatManager.from_config( + config=pipestat_config_path, multi_pipelines=True, pipeline_type="sample", ) @@ -430,8 +472,8 @@ def _get_pipestat_configuration(self, pipeline_type=PipelineLevel.SAMPLE.value): if not pipestat_config_path: self._create_pipestat_config(prj_piface, pipeline_type) else: - prj_piface.psm = PipestatManager( - config_file=pipestat_config_path, + prj_piface.psm = PipestatManager.from_config( + config=pipestat_config_path, multi_pipelines=True, pipeline_type="project", ) @@ -442,7 +484,7 @@ def _get_pipestat_configuration(self, pipeline_type=PipelineLevel.SAMPLE.value): return True - def _check_for_existing_pipestat_config(self, piface): + def _check_for_existing_pipestat_config(self, piface) -> str | None: """ config files should be in looper 
output directory and named as:
@@ -473,7 +515,7 @@ def _check_for_existing_pipestat_config(self, piface):
         else:
             return None

-    def _create_pipestat_config(self, piface, pipeline_type):
+    def _create_pipestat_config(self, piface, pipeline_type: str) -> None:
         """
         Each piface needs its own config file and associated psm
         """
@@ -573,13 +615,13 @@ def _create_pipestat_config(self, piface, pipeline_type):

         # Two end goals, create a config file
         write_pipestat_config(pipestat_config_path, pipestat_config_dict)

-        piface.psm = PipestatManager(
-            config_file=pipestat_config_path, multi_pipelines=True
+        piface.psm = PipestatManager.from_config(
+            config=pipestat_config_path, multi_pipelines=True
         )

         return None

-    def populate_pipeline_outputs(self):
+    def populate_pipeline_outputs(self) -> None:
         """
         Populate project and sample output attributes based on output schemas
         that pipeline interfaces point to.
@@ -597,17 +639,17 @@ def populate_pipeline_outputs(self):
         for schema in schemas:
             populate_project_paths(self, read_schema(schema)[0])

-    def _get_linked_pifaces(self):
-        """
-        Get linked sample pipeline interfaces by project pipeline interface.
+    def _get_linked_pifaces(self) -> dict[str, list[str]]:
+        """Get linked sample pipeline interfaces by project pipeline interface.

         These are indicated in project pipeline interface by
         'linked_pipeline_interfaces' key. If a project pipeline interface
-        does not have such key defined, an empty list is returned for that
-        pipeline interface.
+        does not have such key defined, an empty list is returned for that
+        pipeline interface.

-        :return dict[list[str]]: mapping of sample pipeline interfaces
-            by project pipeline interfaces
+        Returns:
+            dict[str, list[str]]: Mapping of sample pipeline interfaces by
+                project pipeline interfaces.
""" def _process_linked_piface(p, piface, prj_piface): @@ -634,12 +676,12 @@ def _process_linked_piface(p, piface, prj_piface): ) return linked_pifaces - def _piface_by_samples(self): - """ - Create a mapping of all defined interfaces in this Project by samples. + def _piface_by_samples(self) -> dict: + """Create a mapping of all defined interfaces in this Project by samples. - :return dict[str, list[PipelineInterface]]: a collection of pipeline - interfaces keyed by sample name + Returns: + dict[str, list[PipelineInterface]]: A collection of pipeline + interfaces keyed by sample name. """ pifaces_by_sample = {} for source, sample_names in self._samples_by_interface.items(): @@ -652,21 +694,23 @@ def _piface_by_samples(self): pifaces_by_sample.setdefault(sample_name, []).append(pi) return pifaces_by_sample - def _omit_from_repr(self, k, cls): - """ - Exclude the interfaces from representation. + def _omit_from_repr(self, k: str, cls: type) -> bool: + """Exclude the interfaces from representation. - :param str k: key of item to consider for omission - :param type cls: placeholder to comply with superclass signature + Args: + k (str): Key of item to consider for omission. + cls (type): Placeholder to comply with superclass signature. """ return super(Project, self)._omit_from_repr(k, cls) or k == "interfaces" - def _resolve_path_with_cfg(self, pth): - """ - Expand provided path and make it absolute using project config path + def _resolve_path_with_cfg(self, pth: str | None) -> str | None: + """Expand provided path and make it absolute using project config path. - :param str pth: path, possibly including env vars and/or relative - :return str: absolute path + Args: + pth (str): Path, possibly including env vars and/or relative. + + Returns: + str: Absolute path. 
""" if pth is None: return @@ -676,14 +720,14 @@ def _resolve_path_with_cfg(self, pth): _LOGGER.debug("Relative path made absolute: {}".format(pth)) return pth - def _samples_by_piface(self, piface_key): - """ - Create a collection of all samples with valid pipeline interfaces + def _samples_by_piface(self, piface_key: str) -> dict[str, set[str]]: + """Create a collection of all samples with valid pipeline interfaces. + + Args: + piface_key (str): Name of the attribute that holds pipeline interfaces. - :param str piface_key: name of the attribute that holds pipeline - interfaces - :return list[str]: a collection of samples keyed by pipeline interface - source + Returns: + list[str]: A collection of samples keyed by pipeline interface source. """ samples_by_piface = {} msgs = set() @@ -718,11 +762,11 @@ def _samples_by_piface(self, piface_key): _LOGGER.warning(msg) return samples_by_piface - def set_sample_piface(self, sample_piface: Union[List[str], str]) -> NoReturn: - """ - Add sample pipeline interfaces variable to object + def set_sample_piface(self, sample_piface: list[str] | str) -> NoReturn: + """Add sample pipeline interfaces variable to object. - :param list | str sample_piface: sample pipeline interface + Args: + sample_piface (list | str): Sample pipeline interface. """ self.config.setdefault("sample_modifiers", {}) self.config["sample_modifiers"].setdefault("append", {}) @@ -733,43 +777,43 @@ def set_sample_piface(self, sample_piface: Union[List[str], str]) -> NoReturn: def fetch_samples( prj, - selector_attribute=None, - selector_include=None, - selector_exclude=None, - selector_flag=None, - exclusion_flag=None, -): - """ - Collect samples of particular protocol(s). + selector_attribute: str | None = None, + selector_include: list | str | None = None, + selector_exclude: list | str | None = None, + selector_flag: list | str | None = None, + exclusion_flag: list | str | None = None, +) -> list: + """Collect samples of particular protocol(s). 
Protocols can't be both positively selected for and negatively selected against. That is, it makes no sense and is not allowed to specify both selector_include and selector_exclude protocols. On the - other hand, if - neither is provided, all of the Project's Samples are returned. + other hand, if neither is provided, all of the Project's Samples are returned. If selector_include is specified, Samples without a protocol will be - excluded, - but if selector_exclude is specified, protocol-less Samples will be + excluded, but if selector_exclude is specified, protocol-less Samples will be included. - :param Project prj: the Project with Samples to fetch - :param str selector_attribute: name of attribute on which to base the - fetch - :param Iterable[str] | str selector_include: protocol(s) of interest; - if specified, a Sample must - :param Iterable[str] | str selector_exclude: protocol(s) to include - :param Iterable[str] | str selector_flag: flag to select on, e.g. FAILED, COMPLETED - :param Iterable[str] | str exclusion_flag: flag to exclude on, e.g. FAILED, COMPLETED - :return list[Sample]: Collection of this Project's samples with - protocol that either matches one of those in selector_include, - or either - lacks a protocol or does not match one of those in selector_exclude - :raise TypeError: if both selector_include and selector_exclude - protocols are - specified; TypeError since it's basically providing two arguments - when only one is accepted, so remain consistent with vanilla - Python2; - also possible if name of attribute for selection isn't a string + Args: + prj (Project): The Project with Samples to fetch. + selector_attribute (str): Name of attribute on which to base the fetch. + selector_include (Iterable[str] | str): Protocol(s) of interest; if + specified, a Sample must. + selector_exclude (Iterable[str] | str): Protocol(s) to include. + selector_flag (Iterable[str] | str): Flag to select on, e.g. FAILED, + COMPLETED. 
+ exclusion_flag (Iterable[str] | str): Flag to exclude on, e.g. FAILED, + COMPLETED. + + Returns: + list[Sample]: Collection of this Project's samples with protocol that + either matches one of those in selector_include, or either lacks a + protocol or does not match one of those in selector_exclude. + + Raises: + TypeError: If both selector_include and selector_exclude protocols are + specified; TypeError since it's basically providing two arguments + when only one is accepted, so remain consistent with vanilla Python2; + also possible if name of attribute for selection isn't a string. """ kept_samples = prj.samples @@ -796,7 +840,7 @@ def keep(s): # nonsense user error. if selector_include and selector_exclude: raise TypeError( - "Specify only selector_include or selector_exclude parameter, " "not both." + "Specify only selector_include or selector_exclude parameter, not both." ) if not isinstance(selector_attribute, str): @@ -899,7 +943,7 @@ def keep(s): return kept_samples -def make_set(items): +def make_set(items) -> list: if isinstance(items, str): items = [items] elif len(items) == 1: diff --git a/looper/schemas/pipeline_interface_schema_project.yaml b/looper/schemas/pipeline_interface_schema_project.yaml index 294e17aea..7f0a2aaa6 100644 --- a/looper/schemas/pipeline_interface_schema_project.yaml +++ b/looper/schemas/pipeline_interface_schema_project.yaml @@ -45,4 +45,13 @@ properties: singularity_image: type: string description: "Singularity image identifier" + inject_env_vars: + type: object + description: "Environment variables to inject into submission scripts. Keys are variable names, values are Jinja2 templates." + additionalProperties: + type: string + pipestat_config_required: + type: boolean + description: "If false, disables validation that pipestat config is passed to pipeline. Default true." 
+ default: true required: [pipeline_name, pipeline_type, command_template] diff --git a/looper/schemas/pipeline_interface_schema_sample.yaml b/looper/schemas/pipeline_interface_schema_sample.yaml index a69a2ac7a..63b9d8b0c 100644 --- a/looper/schemas/pipeline_interface_schema_sample.yaml +++ b/looper/schemas/pipeline_interface_schema_sample.yaml @@ -45,4 +45,13 @@ properties: singularity_image: type: string description: "Singularity image identifier" + inject_env_vars: + type: object + description: "Environment variables to inject into submission scripts. Keys are variable names, values are Jinja2 templates." + additionalProperties: + type: string + pipestat_config_required: + type: boolean + description: "If false, disables validation that pipestat config is passed to pipeline. Default true." + default: true required: [pipeline_name, pipeline_type, command_template] diff --git a/looper/utils.py b/looper/utils.py index b5d904c52..85b1c4580 100644 --- a/looper/utils.py +++ b/looper/utils.py @@ -1,50 +1,71 @@ """Helpers without an obvious logical home.""" import argparse -from collections import defaultdict import glob import itertools -from logging import getLogger import os -from typing import * import re +from collections import defaultdict +from collections.abc import Iterable +from logging import getLogger import jinja2 import yaml -from peppy import Project as peppyProject -from peppy.const import * -from ubiquerg import convert_value, expandpath, parse_registry_path, deep_update from pephubclient.constants import RegistryPath +from peppy import Project as peppyProject +from peppy.const import CONFIG_KEY, NAME_KEY, SAMPLE_MODS_KEY from pydantic import ValidationError +from rich.console import Console +from rich.pretty import pprint +from ubiquerg import convert_value, deep_update, expandpath, parse_registry_path from yacman import load_yaml from yaml.parser import ParserError -from .const import * -from .command_models.commands import SUPPORTED_COMMANDS +from 
.const import ( + ALL_SUBCMD_KEY, + CLI_KEY, + FLAGS, + LOOPER_DOTFILE_NAME, + LOOPER_GENERIC_COUNT_LINES, + LOOPER_GENERIC_OUTPUT_SCHEMA, + LOOPER_GENERIC_PIPELINE, + LOOPER_KEY, + OUTDIR_KEY, + PEP_CONFIG_KEY, + PIPELINE_INTERFACES_KEY, + PIPESTAT_KEY, + PROJECT_PL_ARG, + SAMPLE_PL_ARG, + PipelineLevel, +) from .exceptions import MisconfigurationException, PipelineInterfaceConfigError -from rich.console import Console -from rich.pretty import pprint _LOGGER = getLogger(__name__) -def fetch_flag_files(prj=None, results_folder="", flags=FLAGS): - """ - Find all flag file paths for the given project. - - :param Project | AttributeDict prj: full Project or AttributeDict with - similar metadata and access/usage pattern - :param str results_folder: path to results folder, corresponding to the - 1:1 sample:folder notion that a looper Project has. That is, this - function uses the assumption that if results_folder rather than project - is provided, the structure of the file tree rooted at results_folder is - such that any flag files to be found are not directly within rootdir but - are directly within on of its first layer of subfolders. - :param Iterable[str] | str flags: Collection of flag names or single flag - name for which to fetch files - :return Mapping[str, list[str]]: collection of filepaths associated with - particular flag for samples within the given project - :raise TypeError: if neither or both of project and rootdir are given +def fetch_flag_files( + prj=None, results_folder: str = "", flags: Iterable[str] | str = FLAGS +) -> dict[str, list[str]]: + """Find all flag file paths for the given project. + + Args: + prj (Project | AttributeDict): Full Project or AttributeDict with + similar metadata and access/usage pattern. + results_folder (str): Path to results folder, corresponding to the + 1:1 sample:folder notion that a looper Project has. 
That is, this + function uses the assumption that if results_folder rather than project + is provided, the structure of the file tree rooted at results_folder is + such that any flag files to be found are not directly within rootdir but + are directly within one of its first layer of subfolders. + flags (Iterable[str] | str): Collection of flag names or single flag + name for which to fetch files. + + Returns: + Mapping[str, list[str]]: Collection of filepaths associated with + particular flag for samples within the given project. + + Raises: + TypeError: If neither or both of project and rootdir are given. """ if not (prj or results_folder) or (prj and results_folder): @@ -76,15 +97,20 @@ def fetch_flag_files(prj=None, results_folder="", flags=FLAGS): return files_by_flag -def fetch_sample_flags(prj, sample, pl_name, flag_dir=None): - """ - Find any flag files present for a sample associated with a project +def fetch_sample_flags( + prj, sample, pl_name: str, flag_dir: str | None = None +) -> list[str]: + """Find any flag files present for a sample associated with a project. + + Args: + prj (looper.Project): Project of interest. + sample (peppy.Sample): Sample object of interest. + pl_name (str): Name of the pipeline for which flag(s) should be found. + flag_dir: Flag directory path. - :param looper.Project prj: project of interest - :param peppy.Sample sample: sample object of interest - :param str pl_name: name of the pipeline for which flag(s) should be found - :return Iterable[str]: collection of flag file path(s) associated with the - given sample for the given project + Returns: + Iterable[str]: Collection of flag file path(s) associated with the + given sample for the given project. 
""" sfolder = flag_dir or sample_folder(prj=prj, sample=sample) if not os.path.isdir(sfolder): @@ -104,10 +130,15 @@ def fetch_sample_flags(prj, sample, pl_name, flag_dir=None): ] -def get_sample_status(sample, flags): - """ - get a sample status +def get_sample_status(sample: str, flags: list[str]) -> str | None: + """Get a sample status. + Args: + sample: Sample identifier. + flags: Collection of flag file paths. + + Returns: + str or None: Status string if found, None otherwise. """ statuses = [] @@ -127,9 +158,8 @@ def get_sample_status(sample, flags): return statuses[0] -def grab_project_data(prj): - """ - From the given Project, grab Sample-independent data. +def grab_project_data(prj) -> dict: + """From the given Project, grab Sample-independent data. There are some aspects of a Project of which it's beneficial for a Sample to be aware, particularly for post-hoc analysis. Since Sample objects @@ -138,8 +168,11 @@ def grab_project_data(prj): so for each Sample knowledge of Project data is limited. This method facilitates adoption of that conceptual model. - :param Project prj: Project from which to grab data - :return Mapping: Sample-independent data sections from given Project + Args: + prj (Project): Project from which to grab data. + + Returns: + Mapping: Sample-independent data sections from given Project. """ if not prj: return {} @@ -150,31 +183,39 @@ def grab_project_data(prj): _LOGGER.debug("Project lacks section '%s', skipping", CONFIG_KEY) -def sample_folder(prj, sample): - """ - Get the path to this Project's root folder for the given Sample. +def sample_folder(prj, sample) -> str: + """Get the path to this Project's root folder for the given Sample. + + Args: + prj (AttributeDict | Project): Project with which sample is associated. + sample (Mapping): Sample or sample data for which to get root output + folder path. 
- :param AttributeDict | Project prj: project with which sample is associated - :param Mapping sample: Sample or sample data for which to get root output - folder path. - :return str: this Project's root folder for the given Sample + Returns: + str: This Project's root folder for the given Sample. """ return os.path.join(prj.results_folder, sample[prj.sample_table_index]) -def get_file_for_project(prj, pipeline_name, appendix=None, directory=None): - """ - Create a path to the file for the current project. - Takes the possibility of amendment being activated at the time +def get_file_for_project( + prj, pipeline_name: str, appendix: str | None = None, directory: str | None = None +) -> str: + """Create a path to the file for the current project. + + Takes the possibility of amendment being activated at the time. Format of the output path: {output_dir}/{directory}/{p.name}_{pipeline_name}_{active_amendments}_{appendix} - :param looper.Project prj: project object - :param str pipeline_name: name of the pipeline to get the file for - :param str appendix: the appendix of the file to create the path for, - like 'objs_summary.tsv' for objects summary file - :return str: path to the file + Args: + prj (looper.Project): Project object. + pipeline_name (str): Name of the pipeline to get the file for. + appendix (str): The appendix of the file to create the path for, + like 'objs_summary.tsv' for objects summary file. + directory (str): Directory path component. + + Returns: + str: Path to the file. """ fp = os.path.join( prj.output_dir, directory or "", f"{prj[NAME_KEY]}_{pipeline_name}" @@ -185,36 +226,21 @@ def get_file_for_project(prj, pipeline_name, appendix=None, directory=None): return fp -def get_file_for_project_old(prj, appendix): - """ - Create a path to the file for the current project. 
- Takes the possibility of amendment being activated at the time +def jinja_render_template_strictly(template: str, namespaces: dict) -> str: + """Render a command string in the provided namespaces context. - :param looper.Project prj: project object - :param str appendix: the appendix of the file to create the path for, - like 'objs_summary.tsv' for objects summary file - :return str: path to the file - """ - fp = os.path.join(prj.output_dir, prj[NAME_KEY]) - if hasattr(prj, AMENDMENTS_KEY) and getattr(prj, AMENDMENTS_KEY): - fp += "_" + "_".join(getattr(prj, AMENDMENTS_KEY)) - fp += "_" + appendix - return fp + Strictly, which means that all the requested attributes must be + available in the namespaces. + Args: + template (str): Command template to be filled in with the + variables in the provided namespaces. For example: + "prog.py --name {project.name} --len {sample.len}". + namespaces (Mapping[Mapping[str]]): Context for command rendering. + Possible namespaces are: looper, project, sample, pipeline. -def jinja_render_template_strictly(template, namespaces): - """ - Render a command string in the provided namespaces context. - - Strictly, which means that all the requested attributes must be - available in the namespaces - - :param str template: command template do be filled in with the - variables in the provided namespaces. For example: - "prog.py --name {project.name} --len {sample.len}" - :param Mapping[Mapping[str] namespaces: context for command rendering. - Possible namespaces are: looper, project, sample, pipeline - :return str: rendered command + Returns: + str: Rendered command. """ def _finfun(x): @@ -242,12 +268,30 @@ def _finfun(x): return rendered -def read_yaml_file(filepath): +def render_inject_env_vars(inject_env_vars: dict, namespaces: dict) -> dict[str, str]: + """Render inject_env_vars templates to concrete values. + + Args: + inject_env_vars (dict): Mapping of variable names to Jinja2 templates. 
+ namespaces (dict): Namespaces to use for rendering. + + Returns: + dict[str, str]: Rendered environment variable name-value pairs. """ - Read a YAML file + rendered = {} + for var_name, template in inject_env_vars.items(): + rendered[var_name] = jinja_render_template_strictly(template, namespaces) + return rendered + - :param str filepath: path to the file to read - :return dict: read data +def read_yaml_file(filepath: str) -> dict | None: + """Read a YAML file. + + Args: + filepath (str): Path to the file to read. + + Returns: + dict: Read data. """ data = None if os.path.exists(filepath): @@ -257,24 +301,24 @@ def read_yaml_file(filepath): def enrich_args_via_cfg( - subcommand_name, + subcommand_name: str, parser_args, - aux_parser, - test_args=None, - cli_modifiers=None, -): - """ - Read in a looper dotfile, pep config and set arguments. + test_args: dict | None = None, + cli_modifiers: dict | None = None, +) -> argparse.Namespace: + """Read in a looper dotfile, pep config and set arguments. Priority order: CLI > dotfile/config > pep_config > parser default - :param subcommand name: the name of the command used - :param argparse.Namespace parser_args: parsed args by the original parser - :param argparse.Namespace aux_parser: parsed args by the argument parser - with defaults suppressed - :param dict test_args: dict of args used for pytesting - :param dict cli_modifiers: dict of args existing if user supplied cli args in looper config file - :return argparse.Namespace: selected argument values + Args: + subcommand_name: The name of the command used. + parser_args (argparse.Namespace): Parsed args by the original parser. + test_args (dict): Dict of args used for pytesting. + cli_modifiers (dict): Dict of args existing if user supplied cli args + in looper config file. + + Returns: + argparse.Namespace: Selected argument values. """ # Did the user provide arguments in the PEP config? 
@@ -305,58 +349,53 @@ def enrich_args_via_cfg( _LOGGER.debug(msg=f"Merged CLI modifiers: {cfg_args_all}") result = argparse.Namespace() - if test_args: - cli_args, _ = aux_parser.parse_known_args(args=test_args) - - else: - cli_args, _ = aux_parser.parse_known_args() - - # If any CLI args were provided, make sure they take priority - if cli_args: - r = getattr(cli_args, subcommand_name) - for k, v in cfg_args_all.items(): - if k in r: - cfg_args_all[k] = getattr(r, k) + cli_args = parser_args # Use parser_args directly, already parsed def set_single_arg(argname, default_source_namespace, result_namespace): - if argname not in POSITIONAL or not hasattr(result, argname): - if argname in cli_args: - cli_provided_value = getattr(cli_args, argname) - r = ( - convert_value(cli_provided_value) - if isinstance(cli_provided_value, str) - else cli_provided_value - ) - elif cfg_args_all is not None and argname in cfg_args_all: - if isinstance(cfg_args_all[argname], list): - r = [convert_value(i) for i in cfg_args_all[argname]] - elif isinstance(cfg_args_all[argname], dict): - r = cfg_args_all[argname] - else: - r = convert_value(cfg_args_all[argname]) + # Priority: CLI > cfg_args_all (PEP config) > parser default + cli_value = getattr(cli_args, argname, None) + cfg_value = cfg_args_all.get(argname) if cfg_args_all else None + default_value = getattr(default_source_namespace, argname, None) + + if cli_value is not None and cli_value != default_value: + # CLI provided a non-default value - use it + r = convert_value(cli_value) if isinstance(cli_value, str) else cli_value + elif cfg_value is not None: + # PEP config provided a value + if isinstance(cfg_value, list): + r = [convert_value(i) for i in cfg_value] + elif isinstance(cfg_value, dict): + r = cfg_value else: - r = getattr(default_source_namespace, argname) - setattr(result_namespace, argname, r) - - for top_level_argname in vars(parser_args): - if top_level_argname not in [cmd.name for cmd in SUPPORTED_COMMANDS]: - # 
this argument is a top-level argument - set_single_arg(top_level_argname, parser_args, result) + r = convert_value(cfg_value) else: - # this argument actually is a subcommand - enriched_command_namespace = argparse.Namespace() - command_namespace = getattr(parser_args, top_level_argname) - if command_namespace: - for argname in vars(command_namespace): - set_single_arg( - argname, command_namespace, enriched_command_namespace - ) - setattr(result, top_level_argname, enriched_command_namespace) + # Use default + r = default_value + setattr(result_namespace, argname, r) + + # Copy all arguments from parser_args to result + for argname in vars(parser_args): + set_single_arg(argname, parser_args, result) + + # Also add any cfg_args that weren't in parser_args + if cfg_args_all: + for argname in cfg_args_all: + if not hasattr(result, argname): + cfg_value = cfg_args_all[argname] + if isinstance(cfg_value, list): + r = [convert_value(i) for i in cfg_value] + elif isinstance(cfg_value, dict): + r = cfg_value + else: + r = convert_value(cfg_value) + setattr(result, argname, r) + return result -def _get_subcommand_args(subcommand_name, parser_args): - """ +def _get_subcommand_args(subcommand_name: str, parser_args) -> dict | None: + """Get the union of values for the subcommand arguments. + Get the union of values for the subcommand arguments from Project.looper, Project.looper.cli. and Project.looper.cli.all. If any are duplicated, the above is the selection priority order. @@ -365,8 +404,11 @@ def _get_subcommand_args(subcommand_name, parser_args): with '_'), which strongly relies on argument parser using default destinations. - :param argparser.Namespace parser_args: argument namespace - :return dict: mapping of argument destinations to their values + Args: + parser_args (argparser.Namespace): Argument namespace. + + Returns: + dict: Mapping of argument destinations to their values. 
""" args = dict() cfg = peppyProject( @@ -412,9 +454,14 @@ def _get_subcommand_args(subcommand_name, parser_args): return args -def init_generic_pipeline(pipelinepath: Optional[str] = None): - """ - Create generic pipeline interface +def init_generic_pipeline(pipelinepath: str | None = None): + """Create generic pipeline interface. + + Args: + pipelinepath (str, optional): Path to pipeline directory. + + Returns: + bool: True if successful. """ console = Console() @@ -446,7 +493,7 @@ def init_generic_pipeline(pipelinepath: Optional[str] = None): }, } - console.rule(f"\n[magenta]Pipeline Interface[/magenta]") + console.rule("\n[magenta]Pipeline Interface[/magenta]") # Write file if not os.path.exists(dest_file): pprint(generic_pipeline_dict, expand_all=True) @@ -481,7 +528,7 @@ def init_generic_pipeline(pipelinepath: Optional[str] = None): }, } - console.rule(f"\n[magenta]Output Schema[/magenta]") + console.rule("\n[magenta]Output Schema[/magenta]") # Write file if not os.path.exists(dest_file): pprint(generic_output_schema_dict, expand_all=True) @@ -495,7 +542,7 @@ def init_generic_pipeline(pipelinepath: Optional[str] = None): f"Output schema file already exists [yellow]`{dest_file}`[/yellow]. Skipping creation.." ) - console.rule(f"\n[magenta]Example Pipeline Shell Script[/magenta]") + console.rule("\n[magenta]Example Pipeline Shell Script[/magenta]") # Create Generic countlines.sh if not pipelinepath: @@ -523,12 +570,15 @@ def init_generic_pipeline(pipelinepath: Optional[str] = None): return True -def read_looper_dotfile(): - """ - Read looper config file - :return str: path to the config file read from the dotfile - :raise MisconfigurationException: if the dotfile does not consist of the - required key pointing to the PEP +def read_looper_dotfile() -> dict: + """Read looper config file. + + Returns: + str: Path to the config file read from the dotfile. 
+ + Raises: + MisconfigurationException: If the dotfile does not consist of the + required key pointing to the PEP. """ dot_file_path = dotfile_path(must_exist=True) return read_looper_config_file(looper_config_path=dot_file_path) @@ -538,24 +588,28 @@ def initiate_looper_config( looper_config_path: str, pep_path: str = None, output_dir: str = None, - sample_pipeline_interfaces: Union[List[str], str] = None, - project_pipeline_interfaces: Union[List[str], str] = None, - force=False, -): - """ - Initialize looper config file - - :param str looper_config_path: absolute path to the file to initialize - :param str pep_path: path to the PEP to be used in pipeline - :param str output_dir: path to the output directory - :param str|list sample_pipeline_interfaces: path or list of paths to sample pipeline interfaces - :param str|list project_pipeline_interfaces: path or list of paths to project pipeline interfaces - :param bool force: whether the existing file should be overwritten - :return bool: whether the file was initialized + sample_pipeline_interfaces: list[str] | str = None, + project_pipeline_interfaces: list[str] | str = None, + force: bool = False, +) -> bool: + """Initialize looper config file. + + Args: + looper_config_path (str): Absolute path to the file to initialize. + pep_path (str): Path to the PEP to be used in pipeline. + output_dir (str): Path to the output directory. + sample_pipeline_interfaces (str | list): Path or list of paths to + sample pipeline interfaces. + project_pipeline_interfaces (str | list): Path or list of paths to + project pipeline interfaces. + force (bool): Whether the existing file should be overwritten. + + Returns: + bool: Whether the file was initialized. 
""" console = Console() console.clear() - console.rule(f"\n[magenta]Looper initialization[/magenta]") + console.rule("\n[magenta]Looper initialization[/magenta]") if os.path.exists(looper_config_path) and not force: console.print( @@ -606,16 +660,16 @@ def initiate_looper_config( return True -def looper_config_tutorial(): - """ - Prompt a user through configuring a .looper.yaml file for a new project. +def looper_config_tutorial() -> bool: + """Prompt a user through configuring a .looper.yaml file for a new project. - :return bool: whether the file was initialized + Returns: + bool: Whether the file was initialized. """ console = Console() console.clear() - console.rule(f"\n[magenta]Looper initialization[/magenta]") + console.rule("\n[magenta]Looper initialization[/magenta]") looper_cfg_path = ".looper.yaml" # not changeable @@ -685,14 +739,12 @@ def looper_config_tutorial(): console.print("\n") - console.print( - f"""\ -[yellow]pep_config:[/yellow] {cfg['pep_config']} -[yellow]output_dir:[/yellow] {cfg['output_dir']} + console.print(f"""\ +[yellow]pep_config:[/yellow] {cfg["pep_config"]} +[yellow]output_dir:[/yellow] {cfg["output_dir"]} [yellow]pipeline_interfaces:[/yellow] - {piface_paths} -""" - ) +""") for piface_path in piface_paths: if not os.path.exists(piface_path): @@ -725,14 +777,21 @@ def looper_config_tutorial(): return True -def determine_pipeline_type(piface_path: str, looper_config_path: str): - """ - Read pipeline interface from disk and determine if it contains "sample_interface", "project_interface" or both +def determine_pipeline_type( + piface_path: str, looper_config_path: str +) -> tuple[list[str] | None, str | None]: + """Read pipeline interface and determine its type. + Read pipeline interface from disk and determine if it contains + "sample_interface", "project_interface" or both. 
-    :param str piface_path: path to pipeline_interface
-    :param str looper_config_path: path to looper config file
-    :return Tuple[Union[str,None],Union[str,None]] : (pipeline type, resolved path) or (None, None)
+    Args:
+        piface_path (str): Path to pipeline_interface.
+        looper_config_path (str): Path to looper config file.
+
+    Returns:
+        tuple[list[str] | None, str | None]: (pipeline types, resolved path)
+            or (None, None).
     """
 
     if piface_path is None:
@@ -764,22 +823,28 @@ def determine_pipeline_type(piface_path: str, looper_config_path: str):
 
         if pipeline_types == []:
             raise PipelineInterfaceConfigError(
-                f"sample_interface and/or project_interface must be defined in each pipeline interface."
+                "sample_interface and/or project_interface must be defined in each pipeline interface."
            )
 
     return pipeline_types, piface_path
 
 
 def read_looper_config_file(looper_config_path: str) -> dict:
-    """
+    """Read Looper config file.
+
     Read Looper config file which includes:
     - PEP config (local path or pephub registry path)
     - looper output dir
     - looper pipeline interfaces
 
-    :param str looper_config_path: path to looper config path
-    :return dict: looper config file content
-    :raise MisconfigurationException: incorrect configuration.
+    Args:
+        looper_config_path (str): Path to looper config path.
+
+    Returns:
+        dict: Looper config file content.
+
+    Raises:
+        MisconfigurationException: Incorrect configuration.
""" return_dict = {} @@ -816,7 +881,6 @@ def read_looper_config_file(looper_config_path: str) -> dict: return_dict[CLI_KEY] = dp_data[CLI_KEY] if PIPELINE_INTERFACES_KEY in dp_data: - dp_data.setdefault(PIPELINE_INTERFACES_KEY, {}) all_pipeline_interfaces = dp_data.get(PIPELINE_INTERFACES_KEY) @@ -873,17 +937,21 @@ def read_looper_config_file(looper_config_path: str) -> dict: return return_dict -def dotfile_path(directory=os.getcwd(), must_exist=False): - """ - Get the path to the looper dotfile +def dotfile_path(directory: str = os.getcwd(), must_exist: bool = False) -> str: + """Get the path to the looper dotfile. If file existence is forced this function will look for it in - the directory parents + the directory parents. + + Args: + directory (str): Directory path to start the search in. + must_exist (bool): Whether the file must exist. - :param str directory: directory path to start the search in - :param bool must_exist: whether the file must exist - :return str: path to the dotfile - :raise OSError: if the file does not exist + Returns: + str: Path to the dotfile. + + Raises: + OSError: If the file does not exist. """ cur_dir = directory if not must_exist: @@ -895,15 +963,21 @@ def dotfile_path(directory=os.getcwd(), must_exist=False): if cur_dir == parent_dir: # root, file does not exist raise OSError( - "Looper dotfile ({}) not found in '{}' and all " - "its parents".format(LOOPER_DOTFILE_NAME, directory) + "Looper dotfile ({}) not found in '{}' and all its parents".format( + LOOPER_DOTFILE_NAME, directory + ) ) cur_dir = parent_dir def is_PEP_file_type(input_string: str) -> bool: - """ - Determines if the provided path is actually a file type that Looper can use for loading PEP + """Determines if the provided path is a file type that Looper can use for loading PEP. + + Args: + input_string (str): Path to check. + + Returns: + bool: True if the path is a valid PEP file type. 
""" PEP_FILE_TYPES = ["yaml", "csv"] @@ -913,13 +987,16 @@ def is_PEP_file_type(input_string: str) -> bool: def is_pephub_registry_path(input_string: str) -> bool: - """ - Check if input is a registry path to pephub - :param str input_string: path to the PEP (or registry path) - :return bool: True if input is a registry path + """Check if input is a registry path to pephub. + + Args: + input_string (str): Path to the PEP (or registry path). + + Returns: + bool: True if input is a registry path. """ try: - registry_path = RegistryPath(**parse_registry_path(input_string)) + RegistryPath(**parse_registry_path(input_string)) except (ValidationError, TypeError): return False return True @@ -943,7 +1020,7 @@ def __init__(self, lo: int, hi: int): ) def __eq__(self, other) -> bool: - return type(other) == type(self) and self.to_tuple() == other.to_tuple() + return type(other) is type(self) and self.to_tuple() == other.to_tuple() def __hash__(self) -> int: return hash(self.to_tuple()) @@ -954,7 +1031,7 @@ def __repr__(self) -> str: def __str__(self) -> str: return f"{self.__class__.__name__}: {self.to_tuple()}" - def to_tuple(self) -> Tuple[int, int]: + def to_tuple(self) -> tuple[int, int]: return self.lo, self.hi @property @@ -965,7 +1042,7 @@ def lo(self) -> int: def hi(self) -> int: return self._hi - def _invalidations(self) -> Iterable[str]: + def _invalidations(self) -> list[str]: problems = [] if self.lo < 1: problems.append(f"Interval must be on natural numbers: {self.lo}") @@ -979,12 +1056,15 @@ def to_range(self) -> Iterable[int]: return range(self.lo, self.hi + 1) @classmethod - def from_string(cls, s: str, upper_bound: int) -> "IntRange": - """ - Create an instance from a string, e.g. command-line argument. + def from_string(cls, s: str, upper_bound: int) -> "NatIntervalInclusive": + """Create an instance from a string, e.g. command-line argument. + + Args: + s (str): The string to parse as an interval. + upper_bound (int): The default upper bound. 
- :param str s: The string to parse as an interval - :param int upper_bound: the default upper bound + Returns: + IntRange: New instance created from the string. """ if upper_bound < 1: raise NatIntervalException(f"Upper bound must be positive: {upper_bound}") @@ -1018,18 +1098,20 @@ def from_string(cls, s: str, upper_bound: int) -> "IntRange": def desired_samples_range_limited(arg: str, num_samples: int) -> Iterable[int]: - """ - Create a contiguous interval of natural numbers. Used for _positive_ selection of samples. + """Create a contiguous interval of natural numbers for positive selection of samples. Interpret given arg as upper bound (1-based) if it's a single value, but take the minimum of that and the given number of samples. If arg is parseable as a range, use that. - :param str arg: CLI specification of a range of samples to use, or as the greatest - 1-based index of a sample to include - :param int num_samples: what to use as the upper bound on the 1-based index interval - if the given arg isn't a range but rather a single value. - :return: an iterable of 1-based indices into samples to select + Args: + arg (str): CLI specification of a range of samples to use, or as the greatest + 1-based index of a sample to include. + num_samples (int): What to use as the upper bound on the 1-based index interval + if the given arg isn't a range but rather a single value. + + Returns: + Iterable[int]: An iterable of 1-based indices into samples to select. """ try: upper_bound = min(int(arg), num_samples) @@ -1042,13 +1124,15 @@ def desired_samples_range_limited(arg: str, num_samples: int) -> Iterable[int]: def desired_samples_range_skipped(arg: str, num_samples: int) -> Iterable[int]: - """ - Create a contiguous interval of natural numbers. Used for _negative_ selection of samples. + """Create a contiguous interval of natural numbers for negative selection of samples. 
- :param str arg: CLI specification of a range of samples to use, or as the lowest - 1-based index of a sample to skip - :param int num_samples: highest 1-based index of samples to include - :return: an iterable of 1-based indices into samples to select + Args: + arg (str): CLI specification of a range of samples to use, or as the lowest + 1-based index of a sample to skip. + num_samples (int): Highest 1-based index of samples to include. + + Returns: + Iterable[int]: An iterable of 1-based indices into samples to select. """ try: lower_bound = int(arg) @@ -1064,15 +1148,18 @@ def desired_samples_range_skipped(arg: str, num_samples: int) -> Iterable[int]: return intv.to_range() -def write_submit_script(fp, content, data): - """ - Write a submission script for divvy by populating a template with data. - :param str fp: Path to the file to which to create/write submissions script. - :param str content: Template for submission script, defining keys that - will be filled by given data - :param Mapping data: a "pool" from which values are available to replace - keys in the template - :return str: Path to the submission script +def write_submit_script(fp: str, content: str, data: dict) -> str: + """Write a submission script for divvy by populating a template with data. + + Args: + fp (str): Path to the file to which to create/write submissions script. + content (str): Template for submission script, defining keys that + will be filled by given data. + data (Mapping): A "pool" from which values are available to replace + keys in the template. + + Returns: + str: Path to the submission script. 
""" for k, v in data.items(): @@ -1082,7 +1169,7 @@ def write_submit_script(fp, content, data): keys_left = re.findall(r"!$\{(.+?)\}", content) if len(keys_left) > 0: _LOGGER.warning( - "> Warning: %d submission template variables are not " "populated: '%s'", + "> Warning: %d submission template variables are not populated: '%s'", len(keys_left), str(keys_left), ) @@ -1100,10 +1187,10 @@ def write_submit_script(fp, content, data): def inspect_looper_config_file(looper_config_dict) -> None: - """ - Inspects looper config by printing it to terminal. - param dict looper_config_dict: dict representing looper_config + """Inspects looper config by printing it to terminal. + Args: + looper_config_dict (dict): Dict representing looper_config. """ # Simply print this to terminal print("LOOPER INSPECT") @@ -1111,7 +1198,7 @@ def inspect_looper_config_file(looper_config_dict) -> None: print(f"{key} {value}") -def expand_nested_var_templates(var_templates_dict, namespaces): +def expand_nested_var_templates(var_templates_dict: dict, namespaces: dict) -> dict: "Takes all var_templates as a dict and recursively expands any paths." result = {} @@ -1125,7 +1212,7 @@ def expand_nested_var_templates(var_templates_dict, namespaces): return result -def render_nested_var_templates(var_templates_dict, namespaces): +def render_nested_var_templates(var_templates_dict: dict, namespaces: dict) -> dict: "Takes all var_templates as a dict and recursively renders the jinja templates." 
result = {} diff --git a/docs/img/looper_logo.svg b/looper_logo.svg similarity index 100% rename from docs/img/looper_logo.svg rename to looper_logo.svg diff --git a/mkdocs.yml b/mkdocs.yml deleted file mode 100644 index 8e5700de1..000000000 --- a/mkdocs.yml +++ /dev/null @@ -1,59 +0,0 @@ -site_name: Looper -site_logo: img/looper_logo_dark.svg -site_url: http://looper.databio.org/ -repo_url: http://github.com/pepkit/looper -pypi_name: loopercli - -nav: - - Getting Started: - - Introduction: README.md - - Features at-a-glance: features.md - - Hello world: hello-world.md - - How-to guides: - - Defining a project: defining-a-project.md - - Running a pipeline: running-a-pipeline.md - - Initializing a repository: initialize.md - - Using pipestat: pipestat.md - - Parameterizing pipelines: parameterizing-pipelines.md - - Running on a cluster: running-on-a-cluster.md - - Grouping many jobs into one: grouping-jobs.md - - Running jobs in containers: containers.md - - Handling multiple input files: how-to-merge-inputs.md - - Running multiple pipelines: multiple-pipelines.md - - Writing a pipeline interface: writing-a-pipeline-interface.md - - Using looper config: looper-config.md - - Using geofetch: using-geofetch.md - - Browsable HTML Reports: looper-report.md - - Using divvy: - - Introduction: divvy/README.md - - Configuring divvy: divvy/configuration.md - - "Tutorial: divvy in python": tutorial_divvy.md - - "Tutorial: divvy on the command line": cli_divvy.md - - Configuring containers: divvy/containers.md - - Configuring connection with client software: divvy/adapters.md - - Default packages: divvy/default-packages.md - - DIVCFG examples: http://github.com/pepkit/divcfg - - Reference: - - Pipeline interface specification: pipeline-interface-specification.md - - Pipeline tiers: pipeline-tiers.md - - Concentric templates: concentric-templates.md - - Pre-submission hooks system: pre-submission-hooks.md - - Looper variable namespaces: variable-namespaces.md - - Usage: usage.md 
- - Configuration files: config-files.md - - API: autodoc_build/looper.md - - FAQ: faq.md - - Support: support.md - - Contributing: contributing.md - - Changelog: changelog.md - -theme: databio - -plugins: - - databio: - autodoc_build: "docs/autodoc_build" - jupyter_source: "docs_jupyter" - jupyter_build: "docs_jupyter/build" - autodoc_package: "looper" - no_top_level: true - - search diff --git a/pyproject.toml b/pyproject.toml new file mode 100644 index 000000000..346903d36 --- /dev/null +++ b/pyproject.toml @@ -0,0 +1,95 @@ +[project] +name = "looper" +version = "2.1.0" +description = "A pipeline submission engine that parses sample inputs and submits pipelines for each sample." +readme = "README.md" +license = "BSD-2-Clause" +requires-python = ">=3.10" +authors = [ + { name = "Nathan Sheffield" }, + { name = "Vince Reuter" }, + { name = "Michal Stolarczyk" }, + { name = "Johanna Klughammer" }, + { name = "Andre Rendeiro" }, + { name = "Donald Campbell" }, +] +keywords = ["bioinformatics", "sequencing", "ngs"] +classifiers = [ + "Development Status :: 4 - Beta", + "License :: OSI Approved :: BSD License", + "Programming Language :: Python :: 3.10", + "Programming Language :: Python :: 3.11", + "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", + "Programming Language :: Python :: 3.14", + "Topic :: Scientific/Engineering :: Bio-Informatics", +] +dependencies = [ + "colorama>=0.3.9", + "eido>=0.2.4", + "jinja2", + "logmuse>=0.2.0", + "pandas>=2.0.2", + "pephubclient>=0.4.0", + "pipestat>=0.12.0a1", + "peppy>=0.40.6", + "pyyaml>=3.12", + "rich>=9.10.0", + "ubiquerg>=0.8.1", + "yacman @ git+https://github.com/databio/yacman.git@dev", # TODO: revert to yacman>=0.9.5 after release + "pydantic-settings>=2.0.0", + "psutil", +] + +[project.urls] +Homepage = "https://github.com/pepkit/looper" + +[project.scripts] +looper = "looper.cli_pydantic:main_cli" +divvy = "looper.__main__:divvy_main" + +[build-system] +requires = ["hatchling"] 
+build-backend = "hatchling.build" + +[tool.hatch.metadata] +allow-direct-references = true + +[project.optional-dependencies] +api = [ + "fastapi>=0.100.0", + "uvicorn>=0.22.0", +] +test = [ + "hypothesis>=6.84.3", + "mock", + "pytest", + "pytest-cov", + "pytest-remotedata", + "GitPython", + "psutil", +] + +[tool.pytest.ini_options] +addopts = "-rfE" +testpaths = ["tests/unit", "tests/divvytests"] # Fast tests only by default +# Integration tests: RUN_INTEGRATION_TESTS=true pytest tests/integration +python_files = ["test_*.py"] +python_classes = ["Test*", "*Test", "*Tests", "*Tester"] +python_functions = ["test_*"] + +[tool.ruff] +line-length = 88 + +[tool.ruff.lint] +select = ["E", "F", "I"] +ignore = ["F403", "F405", "E501"] + +[tool.ruff.lint.per-file-ignores] +"looper/__init__.py" = ["E402", "F401"] +"looper/looper.py" = ["E402"] +"looper/processed_project.py" = ["E402", "F821"] +"tests/**" = ["F841", "E712", "E722"] + +[tool.ruff.lint.isort] +known-first-party = ["looper"] diff --git a/pytest.ini b/pytest.ini deleted file mode 100644 index fe4c5cc58..000000000 --- a/pytest.ini +++ /dev/null @@ -1,6 +0,0 @@ -[pytest] -; Test discovery process, matching tests directory -; Also restrict test discovery to patterned modules, classes, and functions. 
-python_files = test_*.py -python_classes = Test* *Test *Tests *Tester -python_functions = test_* test[A-Z]* diff --git a/requirements/requirements-all.txt b/requirements/requirements-all.txt deleted file mode 100644 index 88af67d8d..000000000 --- a/requirements/requirements-all.txt +++ /dev/null @@ -1,14 +0,0 @@ -colorama>=0.3.9 -eido>=0.2.4 -jinja2 -logmuse>=0.2.0 -pandas>=2.0.2 -pephubclient>=0.4.0 -pipestat>=0.12.0a1 -peppy>=0.40.6 -pyyaml>=3.12 -rich>=9.10.0 -ubiquerg>=0.8.1 -yacman==0.9.3 -pydantic-argparse>=0.9.0 -psutil \ No newline at end of file diff --git a/requirements/requirements-doc.txt b/requirements/requirements-doc.txt deleted file mode 100644 index c5cb76cc1..000000000 --- a/requirements/requirements-doc.txt +++ /dev/null @@ -1,9 +0,0 @@ -https://github.com/databio/mkdocs-databio/archive/master.zip -markdown-include -looper -pephubclient -mkdocs>=1.0 -https://github.com/pepkit/pipestat/archive/refs/heads/master.zip -pydoc-markdown -# versioneer -# Cython diff --git a/requirements/requirements-test.txt b/requirements/requirements-test.txt deleted file mode 100644 index e3ba5d423..000000000 --- a/requirements/requirements-test.txt +++ /dev/null @@ -1,7 +0,0 @@ -hypothesis >= 6.84.3 -mock -pytest -pytest-cov -pytest-remotedata -GitPython -psutil \ No newline at end of file diff --git a/setup.cfg b/setup.cfg deleted file mode 100644 index ec734d815..000000000 --- a/setup.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[aliases] -test = pytest - -[pytest] -# Only request extra info from failures and errors. -addopts = -rfE diff --git a/setup.py b/setup.py deleted file mode 100644 index 08c455ba4..000000000 --- a/setup.py +++ /dev/null @@ -1,95 +0,0 @@ -#! /usr/bin/env python - -import os -import sys - -from setuptools import setup - -# Additional keyword arguments for setup(). 
-extra = {} - - -# Ordinary dependencies -DEPENDENCIES = [] -with open("requirements/requirements-all.txt", "r") as reqs_file: - for line in reqs_file: - if not line.strip(): - continue - # DEPENDENCIES.append(line.split("=")[0].rstrip("<>")) - DEPENDENCIES.append(line) - - -# numexpr for pandas -try: - import numexpr -except ImportError: - # No numexpr is OK for pandas. - pass -else: - # pandas 0.20.2 needs updated numexpr; the claim is 2.4.6, but that failed. - DEPENDENCIES.append("numexpr>=2.6.2") -extra["install_requires"] = DEPENDENCIES - - -# Additional files to include with package -def get_static(name, condition=None): - static = [ - os.path.join(name, f) - for f in os.listdir( - os.path.join(os.path.dirname(os.path.realpath(__file__)), name) - ) - ] - if condition is None: - return static - else: - return [i for i in filter(lambda x: eval(condition), static)] - - -# scripts to be added to the $PATH -# scripts = get_static("scripts", condition="'.' in x") -# scripts removed (TO remove this) -scripts = None - - -with open("looper/_version.py", "r") as versionfile: - version = versionfile.readline().split()[-1].strip("\"'\n") - -with open("README.md") as f: - long_description = f.read() - -setup( - name="looper", - packages=["looper"], - version=version, - description="A pipeline submission engine that parses sample inputs and submits pipelines for each sample.", - long_description=long_description, - long_description_content_type="text/markdown", - classifiers=[ - "Development Status :: 4 - Beta", - "License :: OSI Approved :: BSD License", - "Programming Language :: Python :: 3.8", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Topic :: Scientific/Engineering :: Bio-Informatics", - ], - keywords="bioinformatics, sequencing, ngs", - url="https://github.com/pepkit/looper", - author="Nathan Sheffield, Vince Reuter, Michal Stolarczyk, Johanna Klughammer, Andre Rendeiro", - 
license="BSD2", - entry_points={ - "console_scripts": [ - "looper = looper.cli_pydantic:main_cli", - "divvy = looper.__main__:divvy_main", - ], - }, - scripts=scripts, - package_data={"looper": ["submit_templates/*"]}, - include_package_data=True, - test_suite="tests", - tests_require=(["mock", "pytest"]), - setup_requires=( - ["pytest-runner"] if {"test", "pytest", "ptr"} & set(sys.argv) else [] - ), - **extra -) diff --git a/tests/conftest.py b/tests/conftest.py index 960a98b44..f22f26dae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,314 +1,19 @@ -import shutil -from contextlib import contextmanager -import os -import subprocess -from shutil import copyfile, rmtree, copytree -import tempfile -from typing import * +"""Root test configuration. -import peppy -import pytest -from peppy.const import * -from yaml import dump, safe_load +Test organization: +- tests/unit/ - Fast unit tests with no file I/O +- tests/integration/ - CLI integration tests (set RUN_INTEGRATION_TESTS=true to run) +- tests/divvytests/ - Divvy compute configuration tests -from looper.const import * +Run commands: +- pytest tests/unit tests/divvytests # Fast tests (default) +- RUN_INTEGRATION_TESTS=true pytest tests/integration # Integration tests +- ./tests/scripts/test-integration.sh # Integration tests via script +""" -REPO_URL = "https://github.com/pepkit/hello_looper.git" -CFG = "project_config.yaml" -PIPESTAT_CONFIG = "global_pipestat_config.yaml" -PROJECT_CFG_PIPESTAT = "project_config_pipestat.yaml" -LOOPER_CFG = "looper_config_pipestat.yaml" -PIPESTAT_OS = "pipestat_output_schema.yaml" -PIPESTAT_PI = "pipeline_interface1_sample_pipestat.yaml" -PIPESTAT_PI_PRJ = "pipeline_interface1_project_pipestat.yaml" -ST = "annotation_sheet.csv" -PIP = "pipeline_interface{}_project.yaml" -PIS = "pipeline_interface{}_sample.yaml" -OS = "output_schema.yaml" -RES = "resources-{}.tsv" - - -@pytest.fixture(scope="function") -def dotfile_path(): - path = os.path.join(os.getcwd(), 
LOOPER_DOTFILE_NAME) - yield path - if os.path.isfile(path): - os.remove(path) - - -def get_outdir(pth): - """ - Get output directory from a config file - - :param str pth: - :return str: output directory - """ - with open(pth, "r") as conf_file: - config_data = safe_load(conf_file) - - output_path = config_data[OUTDIR_KEY] - dirname = os.path.dirname(pth) - - return os.path.join(dirname, output_path) - - -def get_project_config_path(looper_config_pth): - """ - Get project config file path from a looper config file path, since they are relative - - :param str pth: - :return str: output directory - """ - dirname = os.path.dirname(looper_config_pth) - - return os.path.join(dirname, "project/project_config.yaml") - - -def _assert_content_in_files(fs: Union[str, Iterable[str]], query: str, negate: bool): - if isinstance(fs, str): - fs = [fs] - check = (lambda doc: query not in doc) if negate else (lambda doc: query in doc) - for f in fs: - with open(f, "r") as fh: - contents = fh.read() - assert check(contents) - - -def assert_content_in_all_files(fs: Union[str, Iterable[str]], query: str): - """ - Verify that string is in files content. - - :param str | Iterable[str] fs: list of files - :param str query: string to look for - """ - _assert_content_in_files(fs, query, negate=False) - - -def assert_content_not_in_any_files(fs: Union[str, Iterable[str]], query: str): - """ - Verify that string is not in files' content. 
- - :param str | Iterable[str] fs: list of files - :param str query: string to look for - """ - _assert_content_in_files(fs, query, negate=True) - - -def print_standard_stream(text: Union[str, bytes]) -> None: - if isinstance(text, bytes): - text = text.decode("utf-8") - if not isinstance(text, str): - raise TypeError(f"Stream to print is neither str nor bytes, but {type(text)}") - for line in text.split("\n"): - print(line) - - -def subp_exec( - pth=None, cmd=None, appendix=list(), dry=True -) -> Tuple[bytes, bytes, int]: - """ - - :param str pth: config path - :param str cmd: looper subcommand - :param Iterable[str] appendix: other args to pass to the cmd - :param bool dry: whether to append dry run flag - :return stdout, stderr, and return code - """ - x = ["looper", cmd, "-d" if dry else ""] - if pth: - x.append(pth) - x.extend(appendix) - proc = subprocess.Popen(x, stderr=subprocess.PIPE, stdout=subprocess.PIPE) - stdout, stderr = proc.communicate() - return stdout, stderr, proc.returncode - - -def test_args_expansion(pth=None, cmd=None, appendix=list(), dry=True) -> List[str]: - """ - This function takes a path, command, extra argument list and creates a list of - strings to pass to looper.main() as test_args. 
- - :param str pth: config path - :param str cmd: looper subcommand - :param Iterable[str] appendix: other args to pass to the cmd - :param bool dry: whether to append dry run flag - :return list of strings to pass to looper.main for testing - """ - # --looper-config .looper.yaml run --dry-run - # x = [cmd, "-d" if dry else ""] - x = [] - if cmd: - x.append(cmd) - if pth: - x.append("--config") - x.append(pth) - if dry: - x.append("--dry-run") - x.extend(appendix) - return x - - -def verify_filecount_in_dir(dirpath, pattern, count): - """ - Check if the expected number of files matching specified pattern - exist in a directory - - :param str dirpath: path to the directory to investigate - :param str pattern: string pattern, used in str.endswith - :param int count: expected number of files - :raise IOError: when the number of files does not meet the expectations - """ - assert os.path.isdir(dirpath) - subm_err = IOError( - f"Expected {count} files mathing '{pattern}' pattern in " - f"'{dirpath}'. 
Listdir: \n{os.listdir(dirpath)}" +# Register custom markers +def pytest_configure(config): + config.addinivalue_line( + "markers", "integration: marks tests as integration tests (skipped by default)" ) - assert sum([f.endswith(pattern) for f in os.listdir(dirpath)]) == count, subm_err - - -@contextmanager -def mod_yaml_data(path): - """ - Context manager used to modify YAML formatted data - - :param str path: path to the file to modify - """ - # TODO: use everywhere - with open(path, "r") as f: - yaml_data = safe_load(f) - print(f"\nInitial YAML data: \n{yaml_data}\n") - yield yaml_data - print(f"\nModified YAML data: \n{yaml_data}\n") - with open(path, "w") as f: - dump(yaml_data, f) - - -@pytest.fixture -def example_pep_piface_path(): - return os.path.join(os.path.dirname(os.path.abspath(__file__)), "data") - - -@pytest.fixture -def example_pep_piface_path_cfg(example_pep_piface_path): - return os.path.join(example_pep_piface_path, CFG) - - -@pytest.fixture -def prep_temp_pep(example_pep_piface_path): - - # Get Path to local copy of hello_looper - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "pytesting/advanced_test") - path_to_looper_config = os.path.join(advanced_dir, ".looper.yaml") - - return path_to_looper_config - - -@pytest.fixture -def prep_temp_pep_basic(example_pep_piface_path): - - # Get Path to local copy of hello_looper - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "pytesting/intermediate_test") - path_to_looper_config = os.path.join(advanced_dir, ".looper.yaml") - - return path_to_looper_config - - -@pytest.fixture -def 
prep_temp_pep_csv(example_pep_piface_path): - - # Get Path to local copy of hello_looper - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "looper_csv_example") - path_to_looper_config = os.path.join(advanced_dir, ".looper.yaml") - - return path_to_looper_config - - -@pytest.fixture -def prep_temp_config_with_pep(example_pep_piface_path): - # temp dir - td = tempfile.mkdtemp() - out_td = os.path.join(td, "output") - # ori paths - cfg_path = os.path.join(example_pep_piface_path, CFG) - sample_table_path = os.path.join(example_pep_piface_path, ST) - piface1s_path = os.path.join(example_pep_piface_path, PIS.format("1")) - temp_path_cfg = os.path.join(td, CFG) - temp_path_sample_table = os.path.join(td, ST) - temp_path_piface1s = os.path.join(td, PIS.format("1")) - - # copying - copyfile(cfg_path, temp_path_cfg) - copyfile(sample_table_path, temp_path_sample_table) - copyfile(piface1s_path, temp_path_piface1s) - - return peppy.Project(temp_path_cfg).to_dict(extended=True), temp_path_piface1s - - -@pytest.fixture -def prep_temp_pep_pipestat(example_pep_piface_path): - - # Get Path to local copy of hello_looper - - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "pytesting/pipestat_test") - path_to_looper_config = os.path.join(advanced_dir, ".looper.yaml") - - return path_to_looper_config - - -@pytest.fixture -def prep_temp_pep_pipestat_advanced(example_pep_piface_path): - - # Get Path to local copy of hello_looper - - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - 
shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "pytesting/advanced_test") - path_to_looper_config = os.path.join(advanced_dir, ".looper_advanced_pipestat.yaml") - - return path_to_looper_config - - -@pytest.fixture -def prep_temp_pep_pephub(example_pep_piface_path): - - # Get Path to local copy of hello_looper - - hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") - - # Make local temp copy of hello_looper - d = tempfile.mkdtemp() - shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) - - advanced_dir = os.path.join(d, "pephub") - path_to_looper_config = os.path.join(advanced_dir, ".looper.yaml") - - return path_to_looper_config diff --git a/tests/data/hello_looper-dev/pep_derived_attrs/pipeline/count_lines_plot.py b/tests/data/hello_looper-dev/pep_derived_attrs/pipeline/count_lines_plot.py index 398c1c02a..7f16be4cc 100644 --- a/tests/data/hello_looper-dev/pep_derived_attrs/pipeline/count_lines_plot.py +++ b/tests/data/hello_looper-dev/pep_derived_attrs/pipeline/count_lines_plot.py @@ -1,7 +1,8 @@ -import matplotlib.pyplot as plt import os import sys +import matplotlib.pyplot as plt + results_dir = sys.argv[ 1 ] # Obtain the looper results directory passed via the looper command template diff --git a/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines.py b/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines.py index 6f6a4ab8f..8efabf415 100755 --- a/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines.py +++ b/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines.py @@ -1,7 +1,7 @@ import os.path +import sys import pipestat -import sys # Very simple pipeline that calls pipestat # takes arguments invoked during looper submission via command templates @@ -13,9 +13,9 @@ schema_path = sys.argv[4] # Create pipestat manager and then report values -psm = pipestat.PipestatManager( - schema_path=schema_path, +psm = 
pipestat.PipestatManager.from_file_backend( results_file_path=results_file, + schema_path=schema_path, record_identifier=sample_name, ) diff --git a/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines_plot.py b/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines_plot.py index bc3a2bce3..6799a335a 100644 --- a/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines_plot.py +++ b/tests/data/hello_looper-dev/pipestat_example/pipeline/count_lines_plot.py @@ -1,8 +1,9 @@ -import matplotlib.pyplot as plt # be sure to `pip install matplotlib` import os -import pipestat import sys +import matplotlib.pyplot as plt # be sure to `pip install matplotlib` +import pipestat + # A pipeline that retrieves previously reported pipestat results # and plots them in a bar chart results_file = sys.argv[1] diff --git a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_project.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_project.yaml index 534905cad..711d7b70f 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_project.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_project.yaml @@ -1,5 +1,4 @@ pipeline_name: PIPELINE1 -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/col_pipeline1.py" project_interface: diff --git a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_sample.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_sample.yaml index d0d608498..e47f597fd 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_sample.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface1_sample.yaml @@ -1,6 +1,5 @@ pipeline_name: PIPELINE1 input_schema: https://schema.databio.org/pep/2.0.0.yaml -output_schema: output_schema.yaml 
var_templates: path: "{looper.piface_dir}/pipeline1.py" pre_submit: diff --git a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_project.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_project.yaml index df557d820..c3600c3c2 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_project.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_project.yaml @@ -1,5 +1,4 @@ pipeline_name: OTHER_PIPELINE2 -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/col_pipeline2.py" project_interface: diff --git a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_sample.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_sample.yaml index 0329d33a2..6689e76a0 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_sample.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipeline_interface2_sample.yaml @@ -1,5 +1,4 @@ pipeline_name: OTHER_PIPELINE2 -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/other_pipeline2.py" pre_submit: diff --git a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface1_sample.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface1_sample.yaml index 4bdbab1fc..183ecf191 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface1_sample.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface1_sample.yaml @@ -1,6 +1,5 @@ pipeline_name: example_pipestat_pipeline input_schema: https://schema.databio.org/pep/2.0.0.yaml -output_schema: pipestat_output_schema.yaml var_templates: path: "{looper.piface_dir}/pipeline1.py" pre_submit: diff --git 
a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface2_sample.yaml b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface2_sample.yaml index 3fa6829c5..8bcf67a3f 100644 --- a/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface2_sample.yaml +++ b/tests/data/hello_looper-dev/pytesting/advanced_test/pipeline/pipestat_pipeline_interface2_sample.yaml @@ -1,6 +1,5 @@ pipeline_name: example_pipestat_pipeline input_schema: https://schema.databio.org/pep/2.0.0.yaml -output_schema: pipestat_output_schema.yaml var_templates: path: "{looper.piface_dir}/other_pipeline2.py" pre_submit: diff --git a/tests/data/hello_looper-dev/pytesting/pipestat_test/pipeline_pipestat/count_lines.py b/tests/data/hello_looper-dev/pytesting/pipestat_test/pipeline_pipestat/count_lines.py index 6f6a4ab8f..8efabf415 100755 --- a/tests/data/hello_looper-dev/pytesting/pipestat_test/pipeline_pipestat/count_lines.py +++ b/tests/data/hello_looper-dev/pytesting/pipestat_test/pipeline_pipestat/count_lines.py @@ -1,7 +1,7 @@ import os.path +import sys import pipestat -import sys # Very simple pipeline that calls pipestat # takes arguments invoked during looper submission via command templates @@ -13,9 +13,9 @@ schema_path = sys.argv[4] # Create pipestat manager and then report values -psm = pipestat.PipestatManager( - schema_path=schema_path, +psm = pipestat.PipestatManager.from_file_backend( results_file_path=results_file, + schema_path=schema_path, record_identifier=sample_name, ) diff --git a/tests/data/pipeline_interface1_project.yaml b/tests/data/pipeline_interface1_project.yaml index cddc14b76..2861c20d5 100644 --- a/tests/data/pipeline_interface1_project.yaml +++ b/tests/data/pipeline_interface1_project.yaml @@ -1,6 +1,5 @@ pipeline_name: PIPELINE1 pipeline_type: project -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/col_pipeline1.py" command_template: > 
diff --git a/tests/data/pipeline_interface1_project_pipestat.yaml b/tests/data/pipeline_interface1_project_pipestat.yaml index fc341ac2d..2861c20d5 100644 --- a/tests/data/pipeline_interface1_project_pipestat.yaml +++ b/tests/data/pipeline_interface1_project_pipestat.yaml @@ -1,6 +1,5 @@ pipeline_name: PIPELINE1 pipeline_type: project -output_schema: pipestat_output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/col_pipeline1.py" command_template: > diff --git a/tests/data/pipeline_interface1_sample.yaml b/tests/data/pipeline_interface1_sample.yaml index 43638d923..f455d8171 100644 --- a/tests/data/pipeline_interface1_sample.yaml +++ b/tests/data/pipeline_interface1_sample.yaml @@ -1,7 +1,6 @@ pipeline_name: PIPELINE1 pipeline_type: sample input_schema: https://schema.databio.org/pep/2.0.0.yaml -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/pipeline1.py" pre_submit: diff --git a/tests/data/pipeline_interface1_sample_pipestat.yaml b/tests/data/pipeline_interface1_sample_pipestat.yaml index d4e5418a2..f455d8171 100644 --- a/tests/data/pipeline_interface1_sample_pipestat.yaml +++ b/tests/data/pipeline_interface1_sample_pipestat.yaml @@ -1,7 +1,6 @@ pipeline_name: PIPELINE1 pipeline_type: sample input_schema: https://schema.databio.org/pep/2.0.0.yaml -output_schema: pipestat_output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/pipeline1.py" pre_submit: diff --git a/tests/data/pipeline_interface2_project.yaml b/tests/data/pipeline_interface2_project.yaml index 7c4a42238..d589db752 100644 --- a/tests/data/pipeline_interface2_project.yaml +++ b/tests/data/pipeline_interface2_project.yaml @@ -1,6 +1,5 @@ pipeline_name: OTHER_PIPELINE2 pipeline_type: project -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/col_pipeline2.py" command_template: > diff --git a/tests/data/pipeline_interface2_sample.yaml b/tests/data/pipeline_interface2_sample.yaml index 
987f7873d..094214722 100644 --- a/tests/data/pipeline_interface2_sample.yaml +++ b/tests/data/pipeline_interface2_sample.yaml @@ -1,6 +1,5 @@ pipeline_name: OTHER_PIPELINE2 pipeline_type: sample -output_schema: output_schema.yaml var_templates: path: "{looper.piface_dir}/pipelines/other_pipeline2.py" pre_submit: diff --git a/tests/divvytests/conftest.py b/tests/divvytests/conftest.py index 2fa0c9049..d42a58724 100644 --- a/tests/divvytests/conftest.py +++ b/tests/divvytests/conftest.py @@ -1,10 +1,10 @@ -import os import glob -import looper.divvy as divvy -import pytest +import os -from looper.divvy import select_divvy_config, DEFAULT_CONFIG_SCHEMA +import pytest +import looper.divvy as divvy +from looper.divvy import select_divvy_config THIS_DIR = os.path.dirname(os.path.abspath(__file__)) DATA_DIR = os.path.join(THIS_DIR, "data/divcfg-master") diff --git a/tests/divvytests/divvy_tests/test_divvy.py b/tests/divvytests/divvy_tests/test_divvy.py index a67e489de..ce247455b 100644 --- a/tests/divvytests/divvy_tests/test_divvy.py +++ b/tests/divvytests/divvy_tests/test_divvy.py @@ -1,9 +1,10 @@ """Assorted divvy tests""" import pytest -from yacman import YacAttMap, load_yaml +from yacman import YAMLConfigManager, load_yaml + from looper.divvy import DEFAULT_COMPUTE_RESOURCES_NAME -from tests.divvytests.conftest import DCC_ATTRIBUTES, FILES, mock_env_missing +from tests.divvytests.conftest import DCC_ATTRIBUTES, FILES class TestDefaultDCC: @@ -60,9 +61,9 @@ class TestGettingActivePackage: """Test for the get_active_package method""" def test_settings_nonempty(self, dcc): - """Test if get_active_package produces a nonempty YacAttMap object""" + """Test if get_active_package produces a nonempty YAMLConfigManager object""" settings = dcc.get_active_package() - assert settings != YacAttMap() + assert settings != YAMLConfigManager() class TestListingPackages: @@ -87,7 +88,7 @@ def test_reset_active_settings(self, dcc): def test_reset_active_settings_works(self, dcc): """Test 
if the settings are cleared""" dcc.reset_active_settings() - assert dcc.get_active_package() == YacAttMap({}) + assert dcc.get_active_package() == YAMLConfigManager({}) class UpdatingPackagesTests: @@ -98,4 +99,4 @@ def test_update_packages(self, dcc, config_file): """Test updating does not produce empty compute packages""" entries = load_yaml(config_file) dcc.update(entries) - assert dcc["compute_packages"] != YacAttMap() + assert dcc["compute_packages"] != YAMLConfigManager() diff --git a/tests/divvytests/helpers.py b/tests/divvytests/helpers.py index be4b11044..1e9c344f9 100644 --- a/tests/divvytests/helpers.py +++ b/tests/divvytests/helpers.py @@ -12,7 +12,7 @@ def get_random_key(n=10): :return str: Randomize text key """ if not isinstance(n, int): - raise TypeError("Non-integral key size".format(n)) + raise TypeError("Non-integral key size") if n < 1: raise ValueError("Non-positive key size: {}".format(n)) return "".join(random.choice(string.ascii_letters) for _ in range(n)) diff --git a/tests/divvytests/regression/test_write_script.py b/tests/divvytests/regression/test_write_script.py index 0a82753c1..4cc68331b 100644 --- a/tests/divvytests/regression/test_write_script.py +++ b/tests/divvytests/regression/test_write_script.py @@ -1,8 +1,10 @@ """Specific case tests for writing submission script""" -from copy import deepcopy import random +from copy import deepcopy + import pytest + from looper.divvy import ComputingConfiguration, select_divvy_config from tests.divvytests.helpers import get_random_key diff --git a/tests/divvytests/test_divvy_simple.py b/tests/divvytests/test_divvy_simple.py index 5770661f7..912716034 100644 --- a/tests/divvytests/test_divvy_simple.py +++ b/tests/divvytests/test_divvy_simple.py @@ -1,9 +1,6 @@ -import looper.divvy as divvy import os -import pytest -from collections import OrderedDict -from yacman import YacAttMap +import looper.divvy as divvy from looper.divvy import select_divvy_config # For interactive debugging: @@ -49,7 
+46,7 @@ def test_write_script(self): # "compute", # [ # dict({"mem": 1000, "test": 0}), -# YacAttMap({"mem": 1000, "test": 0}), +# YAMLConfigManager({"mem": 1000, "test": 0}), # OrderedDict({"mem": 1000, "test": 0}), # ], # ) @@ -68,7 +65,7 @@ def test_write_script(self): # def test_adapters_overwitten_by_others(self): # dcc = divvy.ComputingConfiguration() # dcc.activate_package("singularity_slurm") -# compute = YacAttMap({"mem": 1000}) +# compute = YAMLConfigManager({"mem": 1000}) # extra_vars = [{"compute": compute}, {"MEM": 333}] # dcc.write_script("test1.sub", extra_vars) # with open("test1.sub", "r") as f: diff --git a/tests/integration/__init__.py b/tests/integration/__init__.py new file mode 100644 index 000000000..5e73c37d2 --- /dev/null +++ b/tests/integration/__init__.py @@ -0,0 +1 @@ +# Integration tests - CLI tests requiring temp directories and file I/O diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 000000000..d0eb7b315 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,245 @@ +"""Integration test configuration with environment variable gating.""" + +import os +import shutil +import socket +from contextlib import contextmanager +from shutil import copyfile +from typing import Iterable + +import peppy +import pytest +from yaml import dump, safe_load + +from looper.const import LOOPER_DOTFILE_NAME, OUTDIR_KEY + + +# Skip all integration tests unless explicitly enabled +def pytest_collection_modifyitems(config, items): + """Skip integration tests unless RUN_INTEGRATION_TESTS=true.""" + if os.getenv("RUN_INTEGRATION_TESTS") == "true": + return + skip_marker = pytest.mark.skip( + reason="Integration tests disabled. Set RUN_INTEGRATION_TESTS=true to run." 
+ ) + for item in items: + # Only skip tests in the integration directory that aren't marked as fast + if "integration" in str(item.fspath): + if not any(mark.name == "integration_fast" for mark in item.iter_markers()): + item.add_marker(skip_marker) + + +# File constants +CFG = "project_config.yaml" +PIPESTAT_CONFIG = "global_pipestat_config.yaml" +PROJECT_CFG_PIPESTAT = "project_config_pipestat.yaml" +LOOPER_CFG = "looper_config_pipestat.yaml" +PIPESTAT_OS = "pipestat_output_schema.yaml" +PIPESTAT_PI = "pipeline_interface1_sample_pipestat.yaml" +PIPESTAT_PI_PRJ = "pipeline_interface1_project_pipestat.yaml" +ST = "annotation_sheet.csv" +PIP = "pipeline_interface{}_project.yaml" +PIS = "pipeline_interface{}_sample.yaml" +OS = "output_schema.yaml" +RES = "resources-{}.tsv" + + +@pytest.fixture(scope="function") +def dotfile_path(): + """Fixture for looper dotfile path with cleanup.""" + path = os.path.join(os.getcwd(), LOOPER_DOTFILE_NAME) + yield path + if os.path.isfile(path): + os.remove(path) + + +def get_outdir(pth): + """Get output directory from a config file.""" + with open(pth, "r") as conf_file: + config_data = safe_load(conf_file) + output_path = config_data[OUTDIR_KEY] + dirname = os.path.dirname(pth) + return os.path.join(dirname, output_path) + + +def get_project_config_path(looper_config_pth): + """Get project config file path from a looper config file path.""" + dirname = os.path.dirname(looper_config_pth) + return os.path.join(dirname, "project/project_config.yaml") + + +def _assert_content_in_files(fs: str | Iterable[str], query: str, negate: bool): + """Check file content for presence or absence of query string.""" + if isinstance(fs, str): + fs = [fs] + check = (lambda doc: query not in doc) if negate else (lambda doc: query in doc) + for f in fs: + with open(f, "r") as fh: + contents = fh.read() + assert check(contents) + + +def assert_content_in_all_files(fs: str | Iterable[str], query: str): + """Verify that string is in files content.""" + 
_assert_content_in_files(fs, query, negate=False) + + +def assert_content_not_in_any_files(fs: str | Iterable[str], query: str): + """Verify that string is not in files' content.""" + _assert_content_in_files(fs, query, negate=True) + + +def print_standard_stream(text: str | bytes) -> None: + """Print bytes or str to stdout.""" + if isinstance(text, bytes): + text = text.decode("utf-8") + if not isinstance(text, str): + raise TypeError(f"Stream to print is neither str nor bytes, but {type(text)}") + for line in text.split("\n"): + print(line) + + +def test_args_expansion(pth=None, cmd=None, appendix=None, dry=True): + """Create list of strings to pass to looper.main() as test_args.""" + if appendix is None: + appendix = [] + x = [] + if cmd: + x.append(cmd) + if pth: + x.append("--config") + x.append(pth) + if dry: + x.append("--dry-run") + x.extend(appendix) + return x + + +def verify_filecount_in_dir(dirpath, pattern, count): + """Check if expected number of files matching pattern exist in directory.""" + assert os.path.isdir(dirpath) + subm_err = IOError( + f"Expected {count} files matching '{pattern}' pattern in " + f"'{dirpath}'. 
Listdir: \n{os.listdir(dirpath)}" + ) + assert sum([f.endswith(pattern) for f in os.listdir(dirpath)]) == count, subm_err + + +def is_connected(): + """Determines if local machine can connect to the internet.""" + try: + host = socket.gethostbyname("www.databio.org") + socket.create_connection((host, 80), 2) + return True + except Exception: + pass + return False + + +@contextmanager +def mod_yaml_data(path): + """Context manager to modify YAML formatted data.""" + with open(path, "r") as f: + yaml_data = safe_load(f) + print(f"\nInitial YAML data: \n{yaml_data}\n") + yield yaml_data + print(f"\nModified YAML data: \n{yaml_data}\n") + with open(path, "w") as f: + dump(yaml_data, f) + + +@pytest.fixture +def example_pep_piface_path(): + """Path to test data directory.""" + return os.path.join( + os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "data" + ) + + +@pytest.fixture +def example_pep_piface_path_cfg(example_pep_piface_path): + """Path to test project config.""" + return os.path.join(example_pep_piface_path, CFG) + + +@pytest.fixture +def prep_temp_pep(example_pep_piface_path, tmp_path): + """Prepare a temporary PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "pytesting" / "advanced_test" + path_to_looper_config = str(advanced_dir / ".looper.yaml") + return path_to_looper_config + + +@pytest.fixture +def prep_temp_pep_basic(example_pep_piface_path, tmp_path): + """Prepare a basic temporary PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "pytesting" / "intermediate_test" + path_to_looper_config = str(advanced_dir / ".looper.yaml") + return path_to_looper_config + + +@pytest.fixture +def 
prep_temp_pep_csv(example_pep_piface_path, tmp_path): + """Prepare a CSV-based PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "looper_csv_example" + path_to_looper_config = str(advanced_dir / ".looper.yaml") + return path_to_looper_config + + +@pytest.fixture +def prep_temp_config_with_pep(example_pep_piface_path, tmp_path): + """Prepare temp config with PEP project dict.""" + td = tmp_path / "cfg" + td.mkdir() + cfg_path = os.path.join(example_pep_piface_path, CFG) + sample_table_path = os.path.join(example_pep_piface_path, ST) + piface1s_path = os.path.join(example_pep_piface_path, PIS.format("1")) + temp_path_cfg = str(td / CFG) + temp_path_sample_table = str(td / ST) + temp_path_piface1s = str(td / PIS.format("1")) + copyfile(cfg_path, temp_path_cfg) + copyfile(sample_table_path, temp_path_sample_table) + copyfile(piface1s_path, temp_path_piface1s) + return peppy.Project(temp_path_cfg).to_dict(extended=True), temp_path_piface1s + + +@pytest.fixture +def prep_temp_pep_pipestat(example_pep_piface_path, tmp_path): + """Prepare a pipestat-enabled PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "pytesting" / "pipestat_test" + path_to_looper_config = str(advanced_dir / ".looper.yaml") + return path_to_looper_config + + +@pytest.fixture +def prep_temp_pep_pipestat_advanced(example_pep_piface_path, tmp_path): + """Prepare an advanced pipestat-enabled PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "pytesting" / "advanced_test" + path_to_looper_config = str(advanced_dir / 
".looper_advanced_pipestat.yaml") + return path_to_looper_config + + +@pytest.fixture +def prep_temp_pep_pephub(example_pep_piface_path, tmp_path): + """Prepare a PEPhub PEP for testing.""" + hello_looper_dir_path = os.path.join(example_pep_piface_path, "hello_looper-dev") + d = tmp_path / "pep" + shutil.copytree(hello_looper_dir_path, d, dirs_exist_ok=True) + advanced_dir = d / "pephub" + path_to_looper_config = str(advanced_dir / ".looper.yaml") + return path_to_looper_config diff --git a/tests/test_clean.py b/tests/integration/test_clean.py similarity index 99% rename from tests/test_clean.py rename to tests/integration/test_clean.py index 17a1fa9d0..be70e2c9d 100644 --- a/tests/test_clean.py +++ b/tests/integration/test_clean.py @@ -1,9 +1,11 @@ """Tests for looper's cleaning functionality""" import argparse + import pytest -from looper.looper import Cleaner + from looper import Project +from looper.looper import Cleaner def build_namespace(**kwargs): diff --git a/tests/smoketests/test_other.py b/tests/integration/test_cli_commands.py similarity index 98% rename from tests/smoketests/test_other.py rename to tests/integration/test_cli_commands.py index bc23bfb64..5da535bcf 100644 --- a/tests/smoketests/test_other.py +++ b/tests/integration/test_cli_commands.py @@ -1,16 +1,18 @@ import os.path +import pandas as pd import pytest from peppy import Project +from yaml import dump, safe_load +from looper.cli_pydantic import main +from looper.const import FLAGS, OUTDIR_KEY, PIPESTAT_KEY from looper.exceptions import ( - PipestatConfigurationException, - MisconfigurationException, LooperReportError, + MisconfigurationException, + PipestatConfigurationException, ) -from tests.conftest import * -from looper.cli_pydantic import main -import pandas as pd +from tests.integration.conftest import get_outdir, get_project_config_path def _make_flags_pipestat(cfg, type, pipeline_name): @@ -63,7 +65,6 @@ def _make_flags(cfg, type, pipeline_name): class TestLooperPipestat: - 
@pytest.mark.parametrize("cmd", ["report", "table", "check"]) def test_fail_no_pipestat_config(self, prep_temp_pep, cmd): "report, table, and check should fail if pipestat is NOT configured." @@ -331,8 +332,7 @@ def test_excluding_multi_flags_works( "--config", tp, "--exc-flag", - "completed", - "running", + "completed,running", # pydantic-settings uses comma-separated for lists "--dry-run", ] @@ -381,8 +381,7 @@ def test_selecting_multi_flags_works( "--config", tp, "--sel-flag", - "completed", - "running", + "completed,running", ] try: @@ -489,8 +488,7 @@ def test_excluding_attr_and_flags_works( "--sel-attr", "protocol", "--sel-incl", - "PROTO1", - "PROTO2", + "PROTO1,PROTO2", ] try: diff --git a/tests/smoketests/test_run.py b/tests/integration/test_cli_run.py similarity index 96% rename from tests/smoketests/test_run.py rename to tests/integration/test_cli_run.py index c35d59470..9ee2c8ff8 100644 --- a/tests/smoketests/test_run.py +++ b/tests/integration/test_cli_run.py @@ -4,11 +4,20 @@ from peppy.const import * from yaml import dump +from looper.cli_pydantic import main from looper.const import * +from looper.exceptions import MisconfigurationException from looper.project import Project -from tests.conftest import * -from looper.utils import * -from looper.cli_pydantic import main +from looper.utils import is_PEP_file_type, is_pephub_registry_path +from tests.integration.conftest import ( + assert_content_in_all_files, + assert_content_not_in_any_files, + get_outdir, + get_project_config_path, + mod_yaml_data, + test_args_expansion, + verify_filecount_in_dir, +) CMD_STRS = ["string", " --string", " --sjhsjd 212", "7867#$@#$cc@@"] @@ -65,19 +74,6 @@ def test_is_PEP_file_type(path): assert result == True -def is_connected(): - """Determines if local machine can connect to the internet.""" - import socket - - try: - host = socket.gethostbyname("www.databio.org") - socket.create_connection((host, 80), 2) - return True - except: - pass - return False - - class 
TestLooperBothRuns: @pytest.mark.parametrize("cmd", ["run", "runp"]) def test_looper_cfg_invalid(self, cmd): @@ -160,7 +156,6 @@ def test_looper_single_pipeline(self, prep_temp_pep): tp = prep_temp_pep with mod_yaml_data(tp) as config_data: - pifaces = config_data[PIPELINE_INTERFACES_KEY] config_data[PIPELINE_INTERFACES_KEY] = pifaces[0] @@ -520,10 +515,9 @@ def test_looper_uses_cli_compute_options_spec(self, prep_temp_pep, cmd): assert_content_in_all_files(subs_list, "#SBATCH --mem='12345'") @pytest.mark.parametrize("cmd", ["run", "runp"]) - def test_cli_yaml_settings_general(self, prep_temp_pep, cmd): + def test_cli_yaml_settings_general(self, prep_temp_pep, cmd, tmp_path): tp = prep_temp_pep - td = tempfile.mkdtemp() - settings_file_path = os.path.join(td, "settings.yaml") + settings_file_path = str(tmp_path / "settings.yaml") with open(settings_file_path, "w") as sf: dump({"mem": "testin_mem"}, sf) x = test_args_expansion(tp, cmd, ["--settings", settings_file_path]) @@ -542,10 +536,9 @@ def test_nonexistent_yaml_settings_disregarded(self, prep_temp_pep, cmd): raise pytest.fail("DID RAISE {0}".format(Exception)) @pytest.mark.parametrize("cmd", ["run", "runp"]) - def test_cli_yaml_settings_passes_settings(self, prep_temp_pep, cmd): + def test_cli_yaml_settings_passes_settings(self, prep_temp_pep, cmd, tmp_path): tp = prep_temp_pep - td = tempfile.mkdtemp() - settings_file_path = os.path.join(td, "settings.yaml") + settings_file_path = str(tmp_path / "settings.yaml") with open(settings_file_path, "w") as sf: dump({"mem": "testin_mem"}, sf) @@ -561,10 +554,11 @@ def test_cli_yaml_settings_passes_settings(self, prep_temp_pep, cmd): assert_content_in_all_files(subs_list, "testin_mem") @pytest.mark.parametrize("cmd", ["run", "runp"]) - def test_cli_compute_overwrites_yaml_settings_spec(self, prep_temp_pep, cmd): + def test_cli_compute_overwrites_yaml_settings_spec( + self, prep_temp_pep, cmd, tmp_path + ): tp = prep_temp_pep - td = tempfile.mkdtemp() - 
settings_file_path = os.path.join(td, "settings.yaml") + settings_file_path = str(tmp_path / "settings.yaml") with open(settings_file_path, "w") as sf: dump({"mem": "testin_mem"}, sf) x = test_args_expansion( @@ -593,7 +587,6 @@ def test_cli_compute_overwrites_yaml_settings_spec(self, prep_temp_pep, cmd): reason="This functionality requires input from the user. Causing pytest to error if run without -s flag" ) class TestLooperConfig: - def test_init_config_file(self, prep_temp_pep): tp = prep_temp_pep x = ["init", "--force-yes"] diff --git a/tests/integration/test_cli_startup.py b/tests/integration/test_cli_startup.py new file mode 100644 index 000000000..640b377c5 --- /dev/null +++ b/tests/integration/test_cli_startup.py @@ -0,0 +1,37 @@ +"""Tests for CLI startup performance.""" + +import subprocess +import time + +import pytest + +# These tests are fast and should run by default (not require RUN_INTEGRATION_TESTS) +pytestmark = pytest.mark.integration_fast + + +def test_cli_help_startup_time(): + """Ensure --help responds quickly without loading heavy dependencies.""" + start = time.time() + result = subprocess.run( + ["python", "-m", "looper.cli_pydantic", "--help"], + capture_output=True, + text=True, + ) + elapsed = time.time() - start + + assert result.returncode == 0, f"--help failed: {result.stderr}" + assert elapsed < 0.5, f"CLI --help took {elapsed:.2f}s, should be < 0.5s" + + +def test_subcommand_help_startup_time(): + """Ensure subcommand --help also responds quickly.""" + start = time.time() + result = subprocess.run( + ["python", "-m", "looper.cli_pydantic", "run", "--help"], + capture_output=True, + text=True, + ) + elapsed = time.time() - start + + assert result.returncode == 0, f"run --help failed: {result.stderr}" + assert elapsed < 0.5, f"CLI run --help took {elapsed:.2f}s, should be < 0.5s" diff --git a/tests/smoketests/test_cli_validation.py b/tests/integration/test_cli_validation.py similarity index 94% rename from 
tests/smoketests/test_cli_validation.py rename to tests/integration/test_cli_validation.py index 82e6b4eb1..64c8f9d24 100644 --- a/tests/smoketests/test_cli_validation.py +++ b/tests/integration/test_cli_validation.py @@ -1,17 +1,16 @@ """Tests for the validation of looper CLI use""" -import argparse from typing import * import pytest + +from looper.cli_pydantic import main from looper.const import ( - SAMPLE_SELECTION_ATTRIBUTE_OPTNAME, SAMPLE_EXCLUSION_OPTNAME, SAMPLE_INCLUSION_OPTNAME, + SAMPLE_SELECTION_ATTRIBUTE_OPTNAME, ) -from tests.conftest import print_standard_stream, subp_exec, test_args_expansion -from looper.cli_pydantic import main - +from tests.integration.conftest import test_args_expansion SUBCOMMANDS_WHICH_SUPPORT_SKIP_XOR_LIMIT = ["run", "destroy"] diff --git a/tests/test_comprehensive.py b/tests/integration/test_comprehensive.py similarity index 97% rename from tests/test_comprehensive.py rename to tests/integration/test_comprehensive.py index 41c73ea0c..4cdfc9bd9 100644 --- a/tests/test_comprehensive.py +++ b/tests/integration/test_comprehensive.py @@ -2,20 +2,15 @@ import pytest from peppy.const import * -from yaml import dump - -from looper.const import * -from looper.project import Project -from tests.conftest import * -from looper.utils import * -from looper.cli_pydantic import main -from tests.smoketests.test_run import is_connected -from tempfile import TemporaryDirectory from pipestat import PipestatManager from pipestat.exceptions import RecordNotFoundError - from yaml import dump, safe_load +from looper.cli_pydantic import main +from looper.const import * +from looper.utils import * +from tests.integration.conftest import get_project_config_path, is_connected + CMD_STRS = ["string", " --string", " --sjhsjd 212", "7867#$@#$cc@@"] diff --git a/tests/scripts/test-integration.sh b/tests/scripts/test-integration.sh new file mode 100755 index 000000000..dea470688 --- /dev/null +++ b/tests/scripts/test-integration.sh @@ -0,0 +1,15 @@ 
+#!/bin/bash +# Integration Test Runner for Looper +# Runs full CLI integration tests that require temp directories and file I/O + +set -e +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +PROJECT_ROOT="$SCRIPT_DIR/../.." +cd "$PROJECT_ROOT" + +export RUN_INTEGRATION_TESTS=true + +echo "=== Running Looper Integration Tests ===" +python3 -m pytest tests/integration/ -v "$@" + +echo "=== Integration tests completed successfully! ===" diff --git a/tests/smoketests/.looper.yaml b/tests/smoketests/.looper.yaml deleted file mode 100644 index d4cfc108f..000000000 --- a/tests/smoketests/.looper.yaml +++ /dev/null @@ -1,5 +0,0 @@ -pep_config: example/pep/path -output_dir: . -pipeline_interfaces: - sample: [] - project: [] diff --git a/tests/unit/__init__.py b/tests/unit/__init__.py new file mode 100644 index 000000000..fd6ca98d1 --- /dev/null +++ b/tests/unit/__init__.py @@ -0,0 +1 @@ +# Unit tests - fast tests with no file I/O diff --git a/tests/unit/conftest.py b/tests/unit/conftest.py new file mode 100644 index 000000000..680aeb119 --- /dev/null +++ b/tests/unit/conftest.py @@ -0,0 +1,24 @@ +"""Unit test configuration with minimal, fast fixtures (no file I/O).""" + +import pytest + + +@pytest.fixture +def sample_piface_dict(): + """Sample pipeline interface dictionary for unit tests.""" + return { + "pipeline_name": "test_pipeline", + "pipeline_type": "sample", + "command_template": "python pipeline.py {sample.sample_name}", + } + + +@pytest.fixture +def sample_piface_with_output_schema(): + """Pipeline interface dict with output_schema for pipestat tests.""" + return { + "pipeline_name": "test_pipeline", + "pipeline_type": "sample", + "output_schema": "schema.yaml", + "command_template": "python pipeline.py --pipestat-config {pipestat.config_file}", + } diff --git a/tests/test_desired_sample_range.py b/tests/unit/test_desired_sample_range.py similarity index 97% rename from tests/test_desired_sample_range.py rename to tests/unit/test_desired_sample_range.py 
index 97c662561..bd1394596 100644 --- a/tests/test_desired_sample_range.py +++ b/tests/unit/test_desired_sample_range.py @@ -1,8 +1,9 @@ """Tests for determination of desired sample range""" from itertools import chain + import pytest -from hypothesis import given, strategies as st + from looper.utils import ( NatIntervalException, desired_samples_range_limited, diff --git a/tests/test_natural_range.py b/tests/unit/test_natural_range.py similarity index 97% rename from tests/test_natural_range.py rename to tests/unit/test_natural_range.py index 76f899539..d2c06b22c 100644 --- a/tests/test_natural_range.py +++ b/tests/unit/test_natural_range.py @@ -1,10 +1,12 @@ """Tests for the natural numbers range data type""" from typing import * + import pytest -from hypothesis import given, strategies as st -from looper.utils import NatIntervalException, NatIntervalInclusive +from hypothesis import given +from hypothesis import strategies as st +from looper.utils import NatIntervalException, NatIntervalInclusive gen_pos_int = st.integers(min_value=1) gen_opt_int = st.one_of(st.integers(), st.none()) @@ -71,8 +73,9 @@ def test_from_string__just_delimiter__does_not_parse(legit_delim, upper_bound): @given( lo_hi_upper=st.tuples(gen_opt_int, gen_opt_int, st.integers()).filter( - lambda t: (t[0] is not None or t[1] is not None) - and any(is_non_pos(n) for n in t) + lambda t: ( + (t[0] is not None or t[1] is not None) and any(is_non_pos(n) for n in t) + ) ) ) def test_from_string__nonpositive_values__fail_with_expected_error( diff --git a/tests/unit/test_pipeline_interface.py b/tests/unit/test_pipeline_interface.py new file mode 100644 index 000000000..db9da3c15 --- /dev/null +++ b/tests/unit/test_pipeline_interface.py @@ -0,0 +1,213 @@ +"""Tests for pipestat config handoff validation.""" + +import pytest + +from looper.exceptions import PipelineInterfaceConfigError +from looper.pipeline_interface import PipelineInterface + + +class TestPipestatHandoffValidation: + """Tests for 
pipestat config handoff validation in PipelineInterface.""" + + def test_cli_handoff_with_config_file(self, tmp_path): + """Interface with {pipestat.config_file} in command_template passes validation.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +command_template: > + python pipeline.py --pipestat-config {pipestat.config_file} +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_cli_handoff_with_other_pipestat_var(self, tmp_path): + """Interface with any {pipestat.*} in command_template passes validation.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +command_template: > + python pipeline.py {pipestat.results_file} {pipestat.output_schema} +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_cli_handoff_in_sample_interface(self, tmp_path): + """Interface with {pipestat.*} in sample_interface.command_template passes.""" + piface_content = """ +pipeline_name: test_pipeline +output_schema: schema.yaml +sample_interface: + pipeline_type: sample + command_template: > + python pipeline.py --config {pipestat.config_file} +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_cli_handoff_in_project_interface(self, tmp_path): + """Interface with {pipestat.*} in project_interface.command_template passes.""" + piface_content = """ +pipeline_name: test_pipeline +output_schema: schema.yaml +project_interface: + pipeline_type: project + command_template: > + python pipeline.py --config 
{pipestat.config_file} +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_env_var_handoff(self, tmp_path): + """Interface with PIPESTAT_CONFIG in inject_env_vars passes validation.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +inject_env_vars: + PIPESTAT_CONFIG: "{pipestat.config_file}" +command_template: > + python pipeline.py +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_missing_handoff_raises_error(self, tmp_path): + """Interface with output_schema but no handoff mechanism raises error.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +command_template: > + python pipeline.py --no-pipestat-handoff +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + with pytest.raises(PipelineInterfaceConfigError) as exc_info: + PipelineInterface(str(piface_path)) + + error_msg = str(exc_info.value) + assert "test_pipeline" in error_msg + assert "output_schema" in error_msg + assert "pipestat" in error_msg.lower() + + def test_no_output_schema_skips_validation(self, tmp_path): + """Interface without output_schema skips pipestat validation entirely.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +command_template: > + python pipeline.py --regular-pipeline +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise - no pipestat, no validation + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_pipestat_config_required_false_skips_validation(self, tmp_path): + """Setting 
pipestat_config_required: false disables validation.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +pipestat_config_required: false +command_template: > + python pipeline.py --custom-pipestat-handling +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should not raise due to pipestat_config_required: false + pi = PipelineInterface(str(piface_path)) + assert pi.pipeline_name == "test_pipeline" + + def test_error_message_includes_guidance(self, tmp_path): + """Error message includes clear guidance on how to fix the issue.""" + piface_content = """ +pipeline_name: my_pipeline +pipeline_type: sample +output_schema: schema.yaml +command_template: > + python pipeline.py +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + with pytest.raises(PipelineInterfaceConfigError) as exc_info: + PipelineInterface(str(piface_path)) + + error_msg = str(exc_info.value) + # Should mention both options + assert "command_template" in error_msg + assert "inject_env_vars" in error_msg + assert "PIPESTAT_CONFIG" in error_msg + # Should mention override option + assert "pipestat_config_required: false" in error_msg + + +class TestInjectEnvVars: + """Tests for inject_env_vars rendering in submission scripts.""" + + def test_inject_env_vars_renders_templates(self, tmp_path): + """inject_env_vars templates are rendered with namespaces.""" + from looper.utils import render_inject_env_vars + + inject_env_vars = { + "PIPESTAT_CONFIG": "{pipestat.config_file}", + "OUTPUT_DIR": "{looper.output_dir}", + } + namespaces = { + "pipestat": {"config_file": "/path/to/pipestat_config.yaml"}, + "looper": {"output_dir": "/path/to/output"}, + } + + result = render_inject_env_vars(inject_env_vars, namespaces) + + assert result["PIPESTAT_CONFIG"] == "/path/to/pipestat_config.yaml" + assert result["OUTPUT_DIR"] == "/path/to/output" + + def 
test_inject_env_vars_schema_valid(self, tmp_path): + """inject_env_vars passes schema validation.""" + piface_content = """ +pipeline_name: test_pipeline +pipeline_type: sample +output_schema: schema.yaml +inject_env_vars: + PIPESTAT_CONFIG: "{pipestat.config_file}" + CUSTOM_VAR: "static_value" + DYNAMIC_VAR: "{looper.output_dir}/subdir" +command_template: > + python pipeline.py +""" + piface_path = tmp_path / "piface.yaml" + piface_path.write_text(piface_content) + + # Should pass schema validation + pi = PipelineInterface(str(piface_path)) + assert pi.get("inject_env_vars") is not None + assert pi["inject_env_vars"]["PIPESTAT_CONFIG"] == "{pipestat.config_file}" + assert pi["inject_env_vars"]["CUSTOM_VAR"] == "static_value"