4 changes: 2 additions & 2 deletions conf/modules.config
@@ -181,9 +181,9 @@ process {
         ]
     }

-    withName: CLEAN_PREVIEW_HTML {
+    withName: EXTRACT_PREVIEW_DATA {
         publishDir = [
-            path: { "${params.outdir}/${params.mode}/utility/clean_html/" },
+            path: { "${params.outdir}/${params.mode}/utility/preview_data/" },
             mode: params.publish_dir_mode,
         ]
     }
55 changes: 0 additions & 55 deletions modules/local/utility/clean_html/templates/clean_html.py

This file was deleted.

@@ -1,14 +1,15 @@
-process CLEAN_PREVIEW_HTML {
+process EXTRACT_PREVIEW_DATA {
     tag "${meta.id}"
     label 'process_low'

-    container "community.wave.seqera.io/library/beautifulsoup4_procs:3f09125465990b35"
+    container "community.wave.seqera.io/library/beautifulsoup4_pandas:d3b8b3eb86514c3c"

     input:
     tuple val(meta), path(preview_html)

     output:
-    tuple val(meta), path("${prefix}/preview_mqc.html"), emit: mqc_html
+    tuple val(meta), path("${prefix}/*_mqc.tsv"), emit: mqc_data
+    tuple val(meta), path("${prefix}/*_mqc.png"), emit: mqc_img
     path ("versions.yml"), emit: versions

     when:
@@ -17,27 +18,31 @@ process CLEAN_PREVIEW_HTML {
     script:
     // Exit if running this module with -profile conda / -profile mamba
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+        error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
     }

     prefix = task.ext.prefix ?: "${meta.id}"

-    template('clean_html.py')
+    template('extract_data.py')

     stub:
     // Exit if running this module with -profile conda / -profile mamba
     if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
-        error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+        error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
     }
     prefix = task.ext.prefix ?: "${meta.id}"

     """
     mkdir -p ${prefix}
-    touch ${prefix}/preview_mqc.html
+    touch ${prefix}/noise_distribution_mqc.tsv
+    touch ${prefix}/gene_structure_mqc.tsv
+    touch ${prefix}/umap_mqc.tsv
+    touch ${prefix}/transcript_plots_mqc.png
+    touch ${prefix}/noise_level_mqc.png

     cat <<-END_VERSIONS > versions.yml
     "${task.process}":
-        CLEAN_HTML: "1.0.0"
+        EXTRACT_PREVIEW_DATA: "1.0.0"
     END_VERSIONS
     """
 }
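Note: `prefix` falls back to `meta.id` unless `task.ext.prefix` is set. A minimal sketch of overriding it from `conf/modules.config` (the `_preview` suffix is illustrative, not part of this PR):

    withName: EXTRACT_PREVIEW_DATA {
        ext.prefix = { "${meta.id}_preview" }
    }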
188 changes: 188 additions & 0 deletions modules/local/utility/extract_preview_data/templates/extract_data.py
@@ -0,0 +1,188 @@
#!/usr/bin/env python3


import re
import sys
import json
import html
import base64
from pathlib import Path
from typing import Dict, List, Optional, Tuple
import pandas as pd
from bs4 import BeautifulSoup


def get_png_files(soup: BeautifulSoup, outdir: Path) -> None:
    """Extract base64-encoded PNG images that follow specific <h1> tags in preview.html."""
    target_ids = ["Transcript_Plots", "Noise_Level"]
    outdir.mkdir(parents=True, exist_ok=True)

    for h1_id in target_ids:
        h1_tag = soup.find("h1", id=h1_id)
        if not h1_tag:
            print(f"[WARN] No <h1> with id {h1_id} found")
            continue

        # Look for the first <img> after the h1 in the DOM
        img_tag = h1_tag.find_next("img")
        if not img_tag or not img_tag.get("src"):
            print(f"[WARN] No <img> found after h1#{h1_id}")
            continue

        img_src = img_tag["src"]
        if img_src.startswith("data:image/png;base64,"):
            base64_data = img_src.split(",", 1)[1]
            data = base64.b64decode(base64_data)
        else:
            print(f"[WARN] img src is not base64 PNG for h1#{h1_id}")
            continue

        # Save under the *_mqc.png name declared in the module outputs,
        # so the files match the "${prefix}/*_mqc.png" glob
        img_name = f"{h1_id}_mqc.png".lower()
        out_path = outdir / img_name
        with open(out_path, "wb") as f:
            f.write(data)

        print(f"[INFO] Saved {img_name}")
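
# Illustrative example (assumed preview.html layout, not verified against Baysor):
#   <h1 id="Transcript_Plots">...</h1> ... <img src="data:image/png;base64,iVBORw0KGgo...">
# would be decoded and written to transcript_plots_mqc.png.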


def extract_js_object(text: str, start_idx: int) -> Tuple[Optional[str], int]:
    """Extract a JSON-like object starting at start_idx."""
    if start_idx >= len(text) or text[start_idx] != "{":
        return None, start_idx

    stack, in_str, escape, quote = [], False, False, None
    # A while loop is needed here: the index must jump ahead to skip comments,
    # and reassigning the loop variable of a `for` has no effect in Python.
    i = start_idx
    while i < len(text):
        ch = text[i]
        if in_str:
            if escape:
                escape = False
            elif ch == "\\":
                escape = True
            elif ch == quote:
                in_str = False
        else:
            if ch in ('"', "'"):
                in_str, quote = True, ch
            elif ch == "{":
                stack.append("{")
            elif ch == "}":
                stack.pop()
                if not stack:
                    return text[start_idx : i + 1], i + 1
            elif ch == "/" and i + 1 < len(text):
                # skip js comments
                nxt = text[i + 1]
                if nxt == "/":
                    end = text.find("\n", i + 2)
                    i = len(text) - 1 if end == -1 else end
                elif nxt == "*":
                    end = text.find("*/", i + 2)
                    if end == -1:
                        break
                    i = end + 1
        i += 1

    return None, start_idx
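
# Quick sanity check of the brace matching above:
#   text = "var spec1 = {a: {b: 1}}; more js"
#   extract_js_object(text, text.index("{"))  # -> ("{a: {b: 1}}", 23)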


def js_to_json(js: str) -> str:
    """Convert a JS object string to valid JSON."""
    # Remove comments
    js = re.sub(r"/\*.*?\*/", "", js, flags=re.S)
    js = re.sub(r"//[^\n]*", "", js)

    # Convert single-quoted strings to double-quoted strings, unescaping
    # embedded \' (not a valid JSON escape) and escaping embedded double quotes
    js = re.sub(
        r"'((?:\\.|[^'\\])*)'",
        lambda m: '"' + m.group(1).replace("\\'", "'").replace('"', '\\"') + '"',
        js,
    )

    # Remove trailing commas
    js = re.sub(r",\s*(?=[}\]])", "", js)
    js = re.sub(r",\s*,+", ",", js)

    return js.strip()
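
# For example:
#   js_to_json("{'a': 1, /* note */ }")  # -> '{"a": 1}'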


def find_variables(script_text: str) -> Dict[str, str]:
    """Find all 'var|let|const specN =' declarations and extract their objects."""
    specs: Dict[str, str] = {}
    script_text = html.unescape(script_text)
    pattern = re.compile(r"(?:var|let|const)\s+(spec\d+)\s*=\s*{", re.I)

    for match in pattern.finditer(script_text):
        var = match.group(1)
        obj, _ = extract_js_object(script_text, match.end() - 1)
        if obj:
            specs[var] = obj
        else:
            print(f"[WARN] Could not extract object for {var}")
    return specs
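
# For example, a script body containing
#   var spec1 = {"data": {"values": [{"x": 0, "y": 1}]}};
# yields {"spec1": '{"data": {"values": [{"x": 0, "y": 1}]}}'}.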


def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]:
    """Convert extracted json to tsv."""
    outdir.mkdir(parents=True, exist_ok=True)
    written: List[Path] = []

    for var, js_obj in specs.items():
        try:
            data = json.loads(js_to_json(js_obj))
            values = data.get("data", {}).get("values", [])
            if not values:
                print(f"[WARN] No data.values found in {var}")
                continue

            df = pd.DataFrame(values)
            outpath = outdir / f"{var}_mqc.tsv"

            with open(outpath, "w") as f:
                f.write("# plot_type: linegraph\n")
                f.write(f"# section_name: {var}\n")
                f.write("# description: Extracted preview data\n")
                df.to_csv(f, sep="\t", index=False)

            written.append(outpath)
            print(f"[INFO] Wrote {outpath} ({len(df)} rows × {len(df.columns)} cols)")
        except Exception as e:
            print(f"[ERROR] Failed to process {var}: {e}")

    return written
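
# The resulting spec1_mqc.tsv would then begin (tab-separated):
#   # plot_type: linegraph
#   # section_name: spec1
#   # description: Extracted preview data
#   x\ty
#   0\t1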



if __name__ == "__main__":

    # ${preview_html} and ${prefix} are placeholders filled in by Nextflow's
    # template engine before the script runs
    input_path: Path = Path("${preview_html}")
    outdir: Path = Path("${prefix}")

    text = input_path.read_text(encoding="utf-8", errors="ignore")
    soup = BeautifulSoup(text, "html.parser")

    # get the script section
    if "<script" in text.lower():
        script_text = "\n".join(s.get_text() for s in soup.find_all("script"))
    else:
        script_text = text

    spec_variables = find_variables(script_text)
    if not spec_variables:
        print("[ERROR] No spec* variables found.")
        sys.exit(1)

    # write tsv files for multiqc
    written = write_tsvs(spec_variables, outdir)
    if not written:
        print("[ERROR] No TSVs written.")
        sys.exit(1)

    # get png files
    get_png_files(soup=soup, outdir=outdir)

    # write versions.yml
    with open("versions.yml", "w") as f:
        f.write('"${task.process}":\n')
        f.write('    EXTRACT_PREVIEW_DATA: "1.0.0"\n')
13 changes: 8 additions & 5 deletions subworkflows/local/baysor_generate_preview/main.nf
@@ -4,7 +4,7 @@

 include { BAYSOR_PREVIEW } from '../../../modules/local/baysor/preview/main'
 include { BAYSOR_CREATE_DATASET } from '../../../modules/local/baysor/create_dataset/main'
-include { CLEAN_PREVIEW_HTML } from '../../../modules/local/utility/clean_html/main'
+include { EXTRACT_PREVIEW_DATA } from '../../../modules/local/utility/extract_preview_data/main'
 include { PARQUET_TO_CSV } from '../../../modules/local/utility/spatialconverter/parquet_to_csv/main'

workflow BAYSOR_GENERATE_PREVIEW {
@@ -16,6 +16,7 @@ workflow BAYSOR_GENERATE_PREVIEW {

     ch_versions = Channel.empty()
     ch_preview_mqc_html = Channel.empty()
+    ch_preview_mqc_png = Channel.empty()


// run parquet to csv
@@ -41,12 +42,14 @@ workflow BAYSOR_GENERATE_PREVIEW {
     ch_versions = ch_versions.mix(BAYSOR_PREVIEW.out.versions)

     // clean the preview html file generated
-    CLEAN_PREVIEW_HTML(BAYSOR_PREVIEW.out.preview_html)
-    ch_versions = ch_versions.mix(CLEAN_PREVIEW_HTML.out.versions)
+    EXTRACT_PREVIEW_DATA(BAYSOR_PREVIEW.out.preview_html)
+    ch_versions = ch_versions.mix(EXTRACT_PREVIEW_DATA.out.versions)

-    ch_preview_mqc_html = CLEAN_PREVIEW_HTML.out.mqc_html
+    ch_preview_mqc_html = EXTRACT_PREVIEW_DATA.out.mqc_data
+    ch_preview_mqc_png = EXTRACT_PREVIEW_DATA.out.mqc_img

     emit:
-    preview_html = ch_preview_mqc_html // channel: [ val(meta), ["preview_mqc.html"] ]
+    preview_html = ch_preview_mqc_html // channel: [ val(meta), ["*_mqc.tsv"] ]
+    preview_img = ch_preview_mqc_png // channel: [ val(meta), ["*_mqc.png"] ]
     versions = ch_versions // channel: [ versions.yml ]
 }
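Downstream, the two emitted channels can be mixed into the MultiQC input; a hypothetical wiring (channel names illustrative, not part of this PR):

    ch_multiqc_files = ch_multiqc_files
        .mix(BAYSOR_GENERATE_PREVIEW.out.preview_html.map { meta, tsvs -> tsvs })
        .mix(BAYSOR_GENERATE_PREVIEW.out.preview_img.map { meta, pngs -> pngs })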
1 change: 1 addition & 0 deletions subworkflows/local/baysor_generate_preview/meta.yml
@@ -13,6 +13,7 @@ components:
- baysor/preview
- baysor/create/dataset
- parquet/to/csv
+- extract/preview/data
input:
- ch_transcripts_parquet:
description: |
22 changes: 15 additions & 7 deletions tests/preview_mode.nf.test.snap
@@ -34,10 +34,14 @@
"preview/untar/test_run/transcripts.parquet",
"preview/untar/versions.yml",
"preview/utility",
"preview/utility/clean_html",
"preview/utility/clean_html/test_run",
"preview/utility/clean_html/test_run/preview_mqc.html",
"preview/utility/clean_html/versions.yml",
"preview/utility/preview_data",
"preview/utility/preview_data/test_run",
"preview/utility/preview_data/test_run/gene_structure_mqc.tsv",
"preview/utility/preview_data/test_run/noise_distribution_mqc.tsv",
"preview/utility/preview_data/test_run/noise_level_mqc.png",
"preview/utility/preview_data/test_run/transcript_plots_mqc.png",
"preview/utility/preview_data/test_run/umap_mqc.tsv",
"preview/utility/preview_data/versions.yml",
"preview/utility/spatialconverter",
"preview/utility/spatialconverter/parquet_to_csv",
"preview/utility/spatialconverter/parquet_to_csv/test_run",
@@ -55,8 +59,12 @@
"morphology.ome.tif:md5,d41d8cd98f00b204e9800998ecf8427e",
"transcripts.parquet:md5,d41d8cd98f00b204e9800998ecf8427e",
"versions.yml:md5,4054f048e726d8faf84c982f8180a9e0",
"preview_mqc.html:md5,d41d8cd98f00b204e9800998ecf8427e",
"versions.yml:md5,51c7b56f60dcced3fbd34092c4c5d9bd",
"gene_structure_mqc.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
"noise_distribution_mqc.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
"noise_level_mqc.png:md5,d41d8cd98f00b204e9800998ecf8427e",
"transcript_plots_mqc.png:md5,d41d8cd98f00b204e9800998ecf8427e",
"umap_mqc.tsv:md5,d41d8cd98f00b204e9800998ecf8427e",
"versions.yml:md5,df5efa071f7cde92b1671a88ecc20959",
"transcripts.parquet.csv:md5,d41d8cd98f00b204e9800998ecf8427e",
"versions.yml:md5,b7a26f2cff61d87a77f2db813dbc851a"
]
@@ -65,6 +73,6 @@
"nf-test": "0.9.3",
"nextflow": "25.10.0"
},
"timestamp": "2025-10-29T13:22:36.041317"
"timestamp": "2025-10-30T20:21:33.929224"
}
}