diff --git a/conf/modules.config b/conf/modules.config
index f772918..f91bdce 100644
--- a/conf/modules.config
+++ b/conf/modules.config
@@ -181,9 +181,9 @@ process {
]
}
- withName: CLEAN_PREVIEW_HTML {
+ withName: EXTRACT_PREVIEW_DATA {
publishDir = [
- path: { "${params.outdir}/${params.mode}/utility/clean_html/" },
+ path: { "${params.outdir}/${params.mode}/utility/preview_data/" },
mode: params.publish_dir_mode,
]
}
diff --git a/modules/local/utility/clean_html/templates/clean_html.py b/modules/local/utility/clean_html/templates/clean_html.py
deleted file mode 100644
index aca53be..0000000
--- a/modules/local/utility/clean_html/templates/clean_html.py
+++ /dev/null
@@ -1,55 +0,0 @@
-#!/usr/bin/env python3
-
-from bs4 import BeautifulSoup
-from pathlib import Path
-
-def clean_preview_html(input_html, output_mqc_html, height=800):
- """
- 1. Cleans the Baysor preview HTML by removing the
containing
-
Content
and the
list.
- 2. Inlines the cleaned HTML directly into a MultiQC _mqc.html wrapper.
- """
- input_html = Path(input_html)
- output_mqc_html = Path(prefix)/output_mqc_html
- output_mqc_html.parent.mkdir(parents=True, exist_ok=True)
-
- # Step 1: Clean the HTML
- with open(input_html, 'r') as f:
- soup = BeautifulSoup(f, 'html.parser')
-
- for div in soup.find_all('div'):
- h2 = div.find('h2')
- ul = div.find('ul')
- if h2 and h2.get_text(strip=True) == 'Content' and ul:
- div.decompose()
-
- # Change all to
- for h1 in soup.find_all('h1'):
- h1.name = 'h3'
-
- cleaned_html_content = str(soup)
-
- # Step 2: Wrap and inline into MultiQC _mqc.html
- wrapper_content = f"""
-
-{cleaned_html_content}
-"""
-
- with open(output_mqc_html, 'w') as f:
- f.write(wrapper_content)
-
-
-if __name__ == '__main__':
- preview_html = "${preview_html}"
- output_html = "preview_mqc.html"
- prefix = "${prefix}"
-
- clean_preview_html(
- preview_html,
- output_html
- )
-
- #Output versions.yml
- with open("versions.yml", "w") as f:
- f.write('"${task.process}":\\n')
- f.write('CLEAN_PREVIEW_HTML: "1.0.0"\\n')
diff --git a/modules/local/utility/clean_html/main.nf b/modules/local/utility/extract_preview_data/main.nf
similarity index 54%
rename from modules/local/utility/clean_html/main.nf
rename to modules/local/utility/extract_preview_data/main.nf
index 3218224..d5de41f 100644
--- a/modules/local/utility/clean_html/main.nf
+++ b/modules/local/utility/extract_preview_data/main.nf
@@ -1,14 +1,15 @@
-process CLEAN_PREVIEW_HTML {
+process EXTRACT_PREVIEW_DATA {
tag "${meta.id}"
label 'process_low'
- container "community.wave.seqera.io/library/beautifulsoup4_procs:3f09125465990b35"
+ container "community.wave.seqera.io/library/beautifulsoup4_pandas:d3b8b3eb86514c3c"
input:
tuple val(meta), path(preview_html)
output:
- tuple val(meta), path("${prefix}/preview_mqc.html"), emit: mqc_html
+ tuple val(meta), path("${prefix}/*_mqc.tsv"), emit: mqc_data
+ tuple val(meta), path("${prefix}/*_mqc.png"), emit: mqc_img
path ("versions.yml"), emit: versions
when:
@@ -17,27 +18,31 @@ process CLEAN_PREVIEW_HTML {
script:
// Exit if running this module with -profile conda / -profile mamba
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
- error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+ error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
}
prefix = task.ext.prefix ?: "${meta.id}"
- template('clean_html.py')
+ template('extract_data.py')
stub:
// Exit if running this module with -profile conda / -profile mamba
if (workflow.profile.tokenize(',').intersect(['conda', 'mamba']).size() >= 1) {
- error("CLEAN_HTML module does not support Conda. Please use Docker / Singularity / Podman instead.")
+ error("EXTRACT_PREVIEW_DATA module does not support Conda. Please use Docker / Singularity / Podman instead.")
}
prefix = task.ext.prefix ?: "${meta.id}"
"""
mkdir -p ${prefix}
- touch ${prefix}/preview_mqc.html
+ touch ${prefix}/noise_distribution_mqc.tsv
+ touch ${prefix}/gene_structure_mqc.tsv
+ touch ${prefix}/umap_mqc.tsv
+ touch ${prefix}/transcript_plots_mqc.png
+ touch ${prefix}/noise_level_mqc.png
cat <<-END_VERSIONS > versions.yml
"${task.process}":
- CLEAN_HTML: "1.0.0"
+ EXTRACT_PREVIEW_DATA: "1.0.0"
END_VERSIONS
"""
}
diff --git a/modules/local/utility/extract_preview_data/templates/extract_data.py b/modules/local/utility/extract_preview_data/templates/extract_data.py
new file mode 100644
index 0000000..e27f314
--- /dev/null
+++ b/modules/local/utility/extract_preview_data/templates/extract_data.py
@@ -0,0 +1,188 @@
+#!/usr/bin/env python3
+
+
+import re
+import sys
+import json
+import html
+import base64
+from pathlib import Path
+from typing import Dict, List, Optional, Tuple
+import pandas as pd
+from bs4 import BeautifulSoup
+
+
+def get_png_files(soup: BeautifulSoup, outdir: Path) -> None:
+ """Get png base64 images following specific h1 tags in preview.html"""
+ target_ids = ["Transcript_Plots", "Noise_Level"]
+ outdir.mkdir(parents=True, exist_ok=True)
+
+ for h1_id in target_ids:
+ h1_tag = soup.find("h1", id=h1_id)
+ if not h1_tag:
+ print(f"[WARN] No with id {h1_id} found")
+ continue
+
+        # Look for the first <img> after the h1 in the DOM
+ img_tag = h1_tag.find_next("img")
+ if not img_tag or not img_tag.get("src"):
+ print(f"[WARN] No
found after h1#{h1_id}")
+ continue
+
+ img_src = img_tag["src"]
+ if img_src.startswith("data:image/png;base64,"):
+ base64_data = img_src.split(",", 1)[1]
+ data = base64.b64decode(base64_data)
+ else:
+ print(f"[WARN] img src is not base64 PNG for h1#{h1_id}")
+ continue
+
+        # save png files; the _mqc suffix is required by the module's
+        # ${prefix}/*_mqc.png output glob (see main.nf)
+        img_name = f"{h1_id}_mqc.png".lower()
+ out_path = outdir / img_name
+ with open(out_path, "wb") as f:
+ f.write(data)
+
+ print(f"[INFO] Saved {img_name}")
+
+ return None
+
+
+def extract_js_object(text: str, start_idx: int) -> Tuple[Optional[str], int]:
+ """Extract json-like object starting at start_idx."""
+ if start_idx >= len(text) or text[start_idx] != "{":
+ return None, start_idx
+
+    stack, in_str, escape, quote = [], False, False, None
+    i = start_idx
+    # use a while loop so the index can be advanced past comments below;
+    # reassigning the index of a for-range loop would have no effect
+    while i < len(text):
+        ch = text[i]
+        if in_str:
+            if escape:
+                escape = False
+            elif ch == "\\":
+                escape = True
+            elif ch == quote:
+                in_str = False
+        else:
+            if ch in ('"', "'"):
+                in_str, quote = True, ch
+            elif ch == "{":
+                stack.append("{")
+            elif ch == "}":
+                stack.pop()
+                if not stack:
+                    return text[start_idx : i + 1], i + 1
+            elif ch == "/" and i + 1 < len(text):
+                # skip js comments
+                nxt = text[i + 1]
+                if nxt == "/":
+                    end = text.find("\n", i + 2)
+                    i = len(text) - 1 if end == -1 else end
+                elif nxt == "*":
+                    end = text.find("*/", i + 2)
+                    if end == -1:
+                        break
+                    i = end + 1
+        i += 1
+
+    return None, start_idx
+
+
+def js_to_json(js: str) -> str:
+ """Convert a JS object string to valid JSON."""
+ # Remove comments
+ js = re.sub(r"/\*.*?\*/", "", js, flags=re.S)
+ js = re.sub(r"//[^\n]*", "", js)
+
+ # Convert single-quoted strings to double-quoted strings
+    js = re.sub(
+        r"'((?:\\.|[^'\\])*)'",
+        # un-escape \' (invalid in JSON), then escape embedded double quotes
+        lambda m: '"' + m.group(1).replace("\\'", "'").replace('"', '\\"') + '"',
+        js
+    )
+
+ # Remove trailing commas
+ js = re.sub(r",\s*(?=[}\]])", "", js)
+ js = re.sub(r",\s*,+", ",", js)
+
+ return js.strip()
+
+
+def find_variables(script_text: str) -> Dict[str, str]:
+ """Find all 'var|let|const specN =' declarations and extract their objects."""
+ specs: Dict[str, str] = {}
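+    # Inlined script text may carry HTML-escaped entities, so unescape first.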
+ script_text = html.unescape(script_text)
+ pattern = re.compile(r"(?:var|let|const)\s+(spec\d+)\s*=\s*{", re.I)
+
+ for match in pattern.finditer(script_text):
+ var = match.group(1)
+ obj, _ = extract_js_object(script_text, match.end() - 1)
+ if obj:
+ specs[var] = obj
+ else:
+ print(f"[WARN] Could not extract object for {var}")
+ return specs
+
+
+def write_tsvs(specs: Dict[str, str], outdir: Path) -> List[Path]:
+ """Convert extracted json to tsv."""
+ outdir.mkdir(parents=True, exist_ok=True)
+ written: List[Path] = []
+
+ for var, js_obj in specs.items():
+ try:
+ data = json.loads(js_to_json(js_obj))
+ values = data.get("data", {}).get("values", [])
+ if not values:
+ print(f"[WARN] No data.values found in {var}")
+ continue
+
+ df = pd.DataFrame(values)
+ outpath = outdir / f"{var}_mqc.tsv"
+
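+            # MultiQC picks up *_mqc.tsv files as custom content; the leading
+            # comment lines set the plot type and section metadata.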
+ with open(outpath, "w") as f:
+ f.write("# plot_type: linegraph\n")
+ f.write(f"# section_name: {var}\n")
+ f.write("# description: Extracted preview data\n")
+ df.to_csv(f, sep="\t", index=False)
+
+ written.append(outpath)
+ print(f"[INFO] Wrote {outpath} ({len(df)} rows × {len(df.columns)} cols)")
+ except Exception as e:
+ print(f"[ERROR] Failed to process {var}: {e}")
+
+ return written
+
+
+if __name__ == "__main__":
+
+ input_path: Path = Path("${preview_html}")
+ outdir: Path = Path("${prefix}")
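+    # Both values are filled in by Nextflow's template engine at task runtime.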
+
+ text = input_path.read_text(encoding="utf-8", errors="ignore")
+ soup = BeautifulSoup(text, "html.parser")
+
+    # get the script section; bail out early if the page has none
+    if "<script" not in text:
+        print("[ERROR] No <script> section found in preview HTML")
+        sys.exit(1)
+
+    # save the base64-embedded PNG panels
+    get_png_files(soup, outdir)
+
+    # pull the specN objects out of the inline scripts and write TSVs
+    script_text = "\n".join(s.get_text() for s in soup.find_all("script"))
+    specs = find_variables(script_text)
+    if not specs:
+        print("[WARN] No spec variables found in any script block")
+    write_tsvs(specs, outdir)
+
+    # Output versions.yml
+    with open("versions.yml", "w") as f:
+        f.write('"${task.process}":\\n')
+        f.write('    EXTRACT_PREVIEW_DATA: "1.0.0"\\n')