From 9ec7bdc13fe3d39021a4375e77498ecf5a143600 Mon Sep 17 00:00:00 2001 From: Wayne E <1916647616@qq.com> Date: Thu, 5 Feb 2026 15:49:58 +0800 Subject: [PATCH 1/6] feat: add single-operator subgraph dataset generation script This commit introduces `generate_single_op_dataset.sh` to automate the workflow for generating single-operator subgraph datasets. --- graph_net/test/generate_single_op_dataset.sh | 323 +++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100644 graph_net/test/generate_single_op_dataset.sh diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh new file mode 100644 index 000000000..4dd94cbd6 --- /dev/null +++ b/graph_net/test/generate_single_op_dataset.sh @@ -0,0 +1,323 @@ +#!/bin/bash +set -x + +################################################################################ +# [CRITICAL NOTICE] BEFORE RUNNING THIS SCRIPT: +# +# /graph_net/test/generate_single_op_dataset.sh +# +# 1. Check 'PYTHON_EXEC': Ensure the variable below points to the correct +# Python interpreter in your virtual environment. +# +# 2. Check 'INPUT_LIST': Look for the 'INPUT_LIST' variable inside the +# internal Python script (Stage 1 section). It is currently hardcoded +# to 'small10_torch_samples_list.txt'. Please switch it to your full +# dataset list file before running large-scale generation. +################################################################################ + +# ============================================================================== +# Configuration Area +# ============================================================================== + +# [TODO] HARDCODED: Paths are currently hardcoded; needs dynamic retrieval or arguments in the future. +# Virtual Environment Python Executable Path +PYTHON_EXEC="/workspace/venv_graphnet/bin/python3" +# Project Root Directory +GRAPH_NET_ROOT="/workspace/GraphNet" + +# Script Runtime Arguments +GPU_ID=${1:-0} +RESUME="false" + +# Export environment variables to ensure Python can find graph_net +export CUDA_VISIBLE_DEVICES="${GPU_ID}" +export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}" + +# Workspace Configuration +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}" + +# Define standardized output directory structure +RAW_OUTPUT_DIR="${WORKSPACE}/01_raw_single_op_subgraphs" +RENAMED_OUTPUT_DIR="${WORKSPACE}/02_renamed_single_op_subgraphs" +DEDUPLICATED_OUTPUT_DIR="${WORKSPACE}/03_deduplicated_single_op_subgraphs" + +# Define intermediate list file paths +RAW_SAMPLE_LIST="${WORKSPACE}/sample_list_01_raw.txt" +RENAMED_SAMPLE_LIST="${WORKSPACE}/sample_list_02_renamed.txt" + +# Create workspace +mkdir -p "$WORKSPACE" "$RAW_OUTPUT_DIR" + +# ============================================================================== +# Helper Functions +# ============================================================================== + +# Subgraph list generation function (mimics the original script) +function generate_subgraph_list() { + local target_dir="$1" + local sample_list="$2" + echo ">>> Generate subgraph_sample_list for samples under ${target_dir}." 
+ echo ">>>" + + # Find parent directories of all model.py files to identify valid samples + find ${target_dir} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${target_dir} \ + | tee $sample_list +} + +# ============================================================================== +# Stage 1: Generation (Black Box Mode) +# ============================================================================== + +function generate_raw_data() { + echo ">>> [1] Generating Single Operator Subgraphs (Running Python Script)..." + echo ">>>" + + local TEMP_GEN_SCRIPT="${WORKSPACE}/_internal_gen.py" + + # 1. Write the Python script to a temporary file + # Note: The Python logic is preserved exactly as provided + cat << 'EOF' > "$TEMP_GEN_SCRIPT" +import os +import sys +import time +import math +import subprocess +import datetime +import multiprocessing +import json +import base64 + +# [TODO] HARDCODED: Keep sync with Shell script +PYTHON_EXEC = "/workspace/venv_graphnet/bin/python3" +PROJECT_ROOT = "/workspace/GraphNet" +# [dependency] WARNING: This is currently pointing to the small 10 sample list +INPUT_LIST = os.path.join(PROJECT_ROOT, "graph_net/config/small10_torch_samples_list.txt") + +NUM_GPUS = 2 +TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") +# Temporary directory generated inside the Python script +BASE_DIR = f"/tmp/decompose_run_{TIMESTAMP}" + +def make_config_b64(config_dict): + json_str = json.dumps(config_dict) + return base64.b64encode(json_str.encode('utf-8')).decode('utf-8') + +def run_stage_cmd(env, cwd, cmd_args, stage_name, log_file): + cmd = [PYTHON_EXEC, "-u", "-m", "graph_net.apply_sample_pass"] + cmd_args + try: + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + with open(log_file, "a") as f: + if result.returncode != 0: + f.write(f"\n[FAIL] {stage_name} Error (Exit {result.returncode}):\n") + f.write(result.stderr[-2000:] + "\n") + return False + else: + return True + except Exception as e: + with open(log_file, "a") as f: + f.write(f"\n[CRITICAL] {stage_name} Exception: {str(e)}\n") + return False + +def worker_process(gpu_id, models, base_dir): + log_file = os.path.join(base_dir, "logs", f"worker_gpu{gpu_id}.log") + workspace = base_dir + ranges_dir = os.path.join(workspace, "workspace_single_operator_ranges") + + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + env["GRAPH_NET_ROOT"] = PROJECT_ROOT + env["PYTHONPATH"] = PROJECT_ROOT + env["PYTHONUNBUFFERED"] = "1" + + with open(log_file, "w") as f: + f.write(f"==== Worker GPU {gpu_id} Started ====\n") + + for idx, model_path in enumerate(models): + model_output_dir = os.path.join(workspace, model_path) + os.makedirs(model_output_dir, exist_ok=True) + os.makedirs(ranges_dir, exist_ok=True) + + # Stage 1: OpNamesExtractor + cfg_s1 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "output_dir": workspace + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/op_names_extractor.py", + "--sample-pass-class-name", "OpNamesExtractor", + "--sample-pass-config", cfg_s1 + ], "Stage 1", log_file) + + # Stage 2: OpExtractPointsGenerator + cfg_s2 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "op_names_path_prefix": workspace, + "output_dir": ranges_dir, + "subgraph_ranges_file_name": "subgraph_ranges.json" + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + 
"--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/sample_pass/op_extract_points_generator.py", + "--sample-pass-class-name", "OpExtractPointsGenerator", + "--sample-pass-config", cfg_s2 + ], "Stage 2", log_file) + + # Stage 3: SubgraphGenerator + cfg_s3 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "output_dir": workspace, + "subgraph_ranges_json_root": ranges_dir, + "group_head_and_tail": False, + "chain_style": False + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/subgraph_generator.py", + "--sample-pass-class-name", "SubgraphGenerator", + "--sample-pass-config", cfg_s3 + ], "Stage 3", log_file) + +def main(): + if not os.path.exists(PYTHON_EXEC): return + os.makedirs(BASE_DIR, exist_ok=True) + os.makedirs(os.path.join(BASE_DIR, "logs"), exist_ok=True) + + # This line is the key anchor for the Shell script to capture the path + print(f"Workspace: {BASE_DIR}") + print(f"Dataset Generation Started...") + + with open(INPUT_LIST, 'r') as f: + all_models = [l.strip() for l in f if l.strip() and not l.startswith("#")] + + chunk_size = math.ceil(len(all_models) / NUM_GPUS) + processes = [] + for i in range(NUM_GPUS): + chunk = all_models[i*chunk_size : (i+1)*chunk_size] + if not chunk: continue + p = multiprocessing.Process(target=worker_process, args=(i, chunk, BASE_DIR)) + p.start() + processes.append(p) + + for p in processes: + p.join() + +if __name__ == "__main__": + multiprocessing.set_start_method('spawn', force=True) + main() +EOF + + # 2. Execute the Python script and capture the output directory + # We use tee to output logs to screen and grep to capture the "Workspace: " line + echo ">>> Running internal python generator..." + PYTHON_OUTPUT=$($PYTHON_EXEC $TEMP_GEN_SCRIPT | tee /dev/tty) + + # Extract the generated temporary path + TEMP_SRC_DIR=$(echo "$PYTHON_OUTPUT" | grep "Workspace:" | awk '{print $2}' | tr -d '\r') + + if [ -z "$TEMP_SRC_DIR" ]; then + echo "Error: Could not capture workspace path from python script." + exit 1 + fi + + echo ">>> Python script finished. Temporary output at: $TEMP_SRC_DIR" + + # 3. Move Step + echo ">>> Moving data from temp dir to standardized dir: $RAW_OUTPUT_DIR" + # We only move the generated subgraph folders, excluding logs and range files. + # Assuming subgraphs are generated inside model directories under BASE_DIR, + # we move everything first, then clean up. + + # Move all content + cp -r ${TEMP_SRC_DIR}/* ${RAW_OUTPUT_DIR}/ + + # Clean up unnecessary intermediate artifacts (ranges and logs), keeping only subgraphs + rm -rf ${RAW_OUTPUT_DIR}/logs + rm -rf ${RAW_OUTPUT_DIR}/workspace_single_operator_ranges + + echo ">>> Data moved and cleaned." +} + +# ============================================================================== +# Stage 2: Renaming +# ============================================================================== + +function rename_subgraphs() { + echo ">>> [2] Rename subgraph samples under ${RAW_OUTPUT_DIR}." + echo ">>>" + + # First, generate the list + generate_subgraph_list ${RAW_OUTPUT_DIR} ${RAW_SAMPLE_LIST} + + $PYTHON_EXEC -m graph_net.model_path_handler \ + --model-path-list ${RAW_SAMPLE_LIST} \ + --handler-config=$(base64 -w 0 <>> [3] Remove duplicated subgraph samples under ${RENAMED_OUTPUT_DIR}." + echo ">>>" + + + if [ -d "${DEDUPLICATED_OUTPUT_DIR}" ]; then + echo ">>> Target directory exists. Cleaning up..." 
+ rm -rf "${DEDUPLICATED_OUTPUT_DIR}" + fi + + $PYTHON_EXEC -m graph_net.tools.deduplicated \ + --samples-dir ${RENAMED_OUTPUT_DIR} \ + --target-dir ${DEDUPLICATED_OUTPUT_DIR} +} + +# ============================================================================== +# Main Workflow +# ============================================================================== + +main() { + echo "==========================================================" + echo "START: Single Operator Dataset Generation Pipeline" + echo "Workspace: $WORKSPACE" + echo "==========================================================" + + # 1. Generate raw data + generate_raw_data + + # 2. Rename variables (Standardization) + rename_subgraphs + + # 3. Deduplicate + deduplicate_subgraphs + + echo "==========================================================" + echo "FINISH: Dataset generated at ${DEDUPLICATED_OUTPUT_DIR}" + echo "==========================================================" +} + +main From ca2699bcd56f4f9673bc29b906881920bed621ac Mon Sep 17 00:00:00 2001 From: Wayne E <1916647616@qq.com> Date: Fri, 6 Feb 2026 17:20:28 +0800 Subject: [PATCH 2/6] Refactor generate_single_op_dataset.sh for dynamic paths Refactor script for dynamic path detection and improved error handling. Added logging and workspace setup enhancements. --- graph_net/test/generate_single_op_dataset.sh | 455 ++++++++----------- 1 file changed, 184 insertions(+), 271 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 4dd94cbd6..f3c74635b 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -1,323 +1,236 @@ -#!/bin/bash -set -x - -################################################################################ -# [CRITICAL NOTICE] BEFORE RUNNING THIS SCRIPT: -# -# /graph_net/test/generate_single_op_dataset.sh -# -# 1. Check 'PYTHON_EXEC': Ensure the variable below points to the correct -# Python interpreter in your virtual environment. -# -# 2. Check 'INPUT_LIST': Look for the 'INPUT_LIST' variable inside the -# internal Python script (Stage 1 section). It is currently hardcoded -# to 'small10_torch_samples_list.txt'. Please switch it to your full -# dataset list file before running large-scale generation. -################################################################################ +#!/bin/bash +set -e # ============================================================================== # Configuration Area # ============================================================================== -# [TODO] HARDCODED: Paths are currently hardcoded; needs dynamic retrieval or arguments in the future. -# Virtual Environment Python Executable Path -PYTHON_EXEC="/workspace/venv_graphnet/bin/python3" -# Project Root Directory -GRAPH_NET_ROOT="/workspace/GraphNet" +# [CRITICAL NOTICE] +# This script now uses dynamic path detection. +# Ensure you are running inside the correct Virtual Environment. -# Script Runtime Arguments -GPU_ID=${1:-0} -RESUME="false" +# 1. Dynamic Path Retrieval (Fixing Hardcoded Paths) +# Detect python executable from current PATH +PYTHON_EXEC=$(which python3) +if [ -z "$PYTHON_EXEC" ]; then + echo "Error: 'python3' not found in PATH. Please activate your virtualenv." 
+    exit 1
+fi
 
-# Export environment variables to ensure Python can find graph_net
-export CUDA_VISIBLE_DEVICES="${GPU_ID}"
-export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
+# Detect Project Root dynamically by importing the module
+GRAPH_NET_ROOT=$($PYTHON_EXEC -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
+if [ -z "$GRAPH_NET_ROOT" ]; then
+    echo "Error: Could not determine GRAPH_NET_ROOT. Ensure 'graph_net' is installed or in PYTHONPATH."
+    exit 1
+fi
 
-# Workspace Configuration
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
+# 2. Parallel Processing Config
+AUTO_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
+if [ "$AUTO_GPUS" -eq 0 ]; then AUTO_GPUS=1; fi
 
-# Define standardized output directory structure
-RAW_OUTPUT_DIR="${WORKSPACE}/01_raw_single_op_subgraphs"
-RENAMED_OUTPUT_DIR="${WORKSPACE}/02_renamed_single_op_subgraphs"
-DEDUPLICATED_OUTPUT_DIR="${WORKSPACE}/03_deduplicated_single_op_subgraphs"
+# Logic:
+# 1. If the script is invoked with an argument (e.g., ./script.sh 8), use that value.
+# 2. Otherwise, use the auto-detected GPU count.
+NUM_GPUS=${1:-$AUTO_GPUS}
 
-# Define intermediate list file paths
-RAW_SAMPLE_LIST="${WORKSPACE}/sample_list_01_raw.txt"
-RENAMED_SAMPLE_LIST="${WORKSPACE}/sample_list_02_renamed.txt"
+echo ">>> Detected/Set NUM_GPUS: ${NUM_GPUS}"
 
-# Create workspace
-mkdir -p "$WORKSPACE" "$RAW_OUTPUT_DIR"
+RESUME="false"
 
-# ==============================================================================
-# Helper Functions
-# ==============================================================================
+# 3. Workspace Setup
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
+# Override via the MODEL_LIST environment variable if needed
+MODEL_LIST="${MODEL_LIST:-${GRAPH_NET_ROOT}/graph_net/config/small100_torch_samples_list.txt}"
 
-# Subgraph list generation function (mimics the original script)
-function generate_subgraph_list() {
-    local target_dir="$1"
-    local sample_list="$2"
-    echo ">>> Generate subgraph_sample_list for samples under ${target_dir}."
-    echo ">>>"
-
-    # Find parent directories of all model.py files to identify valid samples
-    find ${target_dir} -name "model.py" \
-        | xargs dirname \
-        | xargs realpath --relative-to=${target_dir} \
-        | tee $sample_list
-}
+# 4. Output Directories
+OP_NAMES_DIR="${WORKSPACE}/01_op_names"
+RANGES_DIR="${WORKSPACE}/02_ranges"
+RAW_SUBGRAPH_DIR="${WORKSPACE}/03_raw_subgraphs"
+RENAMED_DIR="${WORKSPACE}/04_renamed"
+DEDUPLICATED_DIR="${WORKSPACE}/05_deduplicated"
+LOG_DIR="${WORKSPACE}/logs" # New: Dedicated log directory
+
+export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
+export GRAPH_NET_ROOT PYTHON_EXEC WORKSPACE OP_NAMES_DIR RANGES_DIR RAW_SUBGRAPH_DIR RESUME LOG_DIR
+
+mkdir -p "$WORKSPACE" "$LOG_DIR"
 
 # ==============================================================================
-# Stage 1: Generation (Black Box Mode)
+# Core Logic: Single Model Processing (V3: Strict Error Checking)
 # ==============================================================================
-
-function generate_raw_data() {
-    echo ">>> [1] Generating Single Operator Subgraphs (Running Python Script)..."
-    echo ">>>"
-
-    local TEMP_GEN_SCRIPT="${WORKSPACE}/_internal_gen.py"
-
-    # 1. 
Write the Python script to a temporary file - # Note: The Python logic is preserved exactly as provided - cat << 'EOF' > "$TEMP_GEN_SCRIPT" -import os -import sys -import time -import math -import subprocess -import datetime -import multiprocessing -import json -import base64 - -# [TODO] HARDCODED: Keep sync with Shell script -PYTHON_EXEC = "/workspace/venv_graphnet/bin/python3" -PROJECT_ROOT = "/workspace/GraphNet" -# [dependency] WARNING: This is currently pointing to the small 10 sample list -INPUT_LIST = os.path.join(PROJECT_ROOT, "graph_net/config/small10_torch_samples_list.txt") - -NUM_GPUS = 2 -TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") -# Temporary directory generated inside the Python script -BASE_DIR = f"/tmp/decompose_run_{TIMESTAMP}" - -def make_config_b64(config_dict): - json_str = json.dumps(config_dict) - return base64.b64encode(json_str.encode('utf-8')).decode('utf-8') - -def run_stage_cmd(env, cwd, cmd_args, stage_name, log_file): - cmd = [PYTHON_EXEC, "-u", "-m", "graph_net.apply_sample_pass"] + cmd_args - try: - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) - with open(log_file, "a") as f: - if result.returncode != 0: - f.write(f"\n[FAIL] {stage_name} Error (Exit {result.returncode}):\n") - f.write(result.stderr[-2000:] + "\n") - return False - else: - return True - except Exception as e: - with open(log_file, "a") as f: - f.write(f"\n[CRITICAL] {stage_name} Exception: {str(e)}\n") - return False - -def worker_process(gpu_id, models, base_dir): - log_file = os.path.join(base_dir, "logs", f"worker_gpu{gpu_id}.log") - workspace = base_dir - ranges_dir = os.path.join(workspace, "workspace_single_operator_ranges") - - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - env["GRAPH_NET_ROOT"] = PROJECT_ROOT - env["PYTHONPATH"] = PROJECT_ROOT - env["PYTHONUNBUFFERED"] = "1" - - with open(log_file, "w") as f: - f.write(f"==== Worker GPU {gpu_id} Started ====\n") +process_single_model() { + local model_path=$1 + local gpu_id=$2 - for idx, model_path in enumerate(models): - model_output_dir = os.path.join(workspace, model_path) - os.makedirs(model_output_dir, exist_ok=True) - os.makedirs(ranges_dir, exist_ok=True) - - # Stage 1: OpNamesExtractor - cfg_s1 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "output_dir": workspace - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/op_names_extractor.py", - "--sample-pass-class-name", "OpNamesExtractor", - "--sample-pass-config", cfg_s1 - ], "Stage 1", log_file) - - # Stage 2: OpExtractPointsGenerator - cfg_s2 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "op_names_path_prefix": workspace, - "output_dir": ranges_dir, - "subgraph_ranges_file_name": "subgraph_ranges.json" - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/sample_pass/op_extract_points_generator.py", - "--sample-pass-class-name", "OpExtractPointsGenerator", - "--sample-pass-config", cfg_s2 - ], "Stage 2", log_file) - - # Stage 3: SubgraphGenerator - cfg_s3 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "output_dir": workspace, - "subgraph_ranges_json_root": ranges_dir, - "group_head_and_tail": False, - "chain_style": False - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", 
f"{PROJECT_ROOT}/graph_net/torch/sample_pass/subgraph_generator.py", - "--sample-pass-class-name", "SubgraphGenerator", - "--sample-pass-config", cfg_s3 - ], "Stage 3", log_file) - -def main(): - if not os.path.exists(PYTHON_EXEC): return - os.makedirs(BASE_DIR, exist_ok=True) - os.makedirs(os.path.join(BASE_DIR, "logs"), exist_ok=True) + export CUDA_VISIBLE_DEVICES="${gpu_id}" - # This line is the key anchor for the Shell script to capture the path - print(f"Workspace: {BASE_DIR}") - print(f"Dataset Generation Started...") + local safe_name=$(basename "$model_path") + local tmp_list="${WORKSPACE}/tmp_list_${BASHPID}.txt" + local log_file="${LOG_DIR}/${safe_name}_${BASHPID}.log" + + echo "${model_path}" > "${tmp_list}" + echo "=== Processing ${model_path} ===" > "$log_file" + + run_step() { + local step_name=$1 + local cmd_str=$2 + + echo "---------------------------------------------------" >> "$log_file" + echo ">>> Running Stage: ${step_name}" >> "$log_file" + + + if ! eval "$cmd_str" >> "$log_file" 2>&1; then + echo "[GPU ${gpu_id}] System Failed at ${step_name}: ${model_path}" + return 1 + fi + + if grep -q -E "Traceback \(most recent call last\)|Error:|Exception:" "$log_file"; then + echo "[GPU ${gpu_id}] Logic Failed at ${step_name} (Found Traceback): ${model_path}" + echo " -> Log saved at: ${log_file}" + tail -n 5 "$log_file" | sed "s/^/[GPU ${gpu_id}] /" + return 1 + fi + + return 0 + } - with open(INPUT_LIST, 'r') as f: - all_models = [l.strip() for l in f if l.strip() and not l.startswith("#")] - - chunk_size = math.ceil(len(all_models) / NUM_GPUS) - processes = [] - for i in range(NUM_GPUS): - chunk = all_models[i*chunk_size : (i+1)*chunk_size] - if not chunk: continue - p = multiprocessing.Process(target=worker_process, args=(i, chunk, BASE_DIR)) - p.start() - processes.append(p) + # --- Stage 1: Op Names --- + cmd_s1="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Running internal python generator..." - PYTHON_OUTPUT=$($PYTHON_EXEC $TEMP_GEN_SCRIPT | tee /dev/tty) - - # Extract the generated temporary path - TEMP_SRC_DIR=$(echo "$PYTHON_OUTPUT" | grep "Workspace:" | awk '{print $2}' | tr -d '\r') + # --- Stage 3: Decompose --- + cmd_s3="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Python script finished. Temporary output at: $TEMP_SRC_DIR" - - # 3. Move Step - echo ">>> Moving data from temp dir to standardized dir: $RAW_OUTPUT_DIR" - # We only move the generated subgraph folders, excluding logs and range files. - # Assuming subgraphs are generated inside model directories under BASE_DIR, - # we move everything first, then clean up. - - # Move all content - cp -r ${TEMP_SRC_DIR}/* ${RAW_OUTPUT_DIR}/ - - # Clean up unnecessary intermediate artifacts (ranges and logs), keeping only subgraphs - rm -rf ${RAW_OUTPUT_DIR}/logs - rm -rf ${RAW_OUTPUT_DIR}/workspace_single_operator_ranges - - echo ">>> Data moved and cleaned." + echo "[GPU ${gpu_id}] Done: ${model_path}" } +export -f process_single_model + # ============================================================================== -# Stage 2: Renaming +# Helper Function: Subgraph List Generation # ============================================================================== +function generate_subgraph_list() { + local target_dir="$1" + local sample_list="$2" + echo ">>> Generating subgraph list for ${target_dir}..." 
+ find ${target_dir} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${target_dir} \ + | tee $sample_list > /dev/null +} -function rename_subgraphs() { - echo ">>> [2] Rename subgraph samples under ${RAW_OUTPUT_DIR}." - echo ">>>" +# ============================================================================== +# Main Pipeline Dispatcher +# ============================================================================== +function main() { + echo ">>> Starting Pipeline..." + echo " Python: $PYTHON_EXEC" + echo " Root: $GRAPH_NET_ROOT" + echo " Logs: $LOG_DIR" + + # 1. Prepare Data + if [ ! -f "$MODEL_LIST" ]; then + echo "Error: Model list not found at $MODEL_LIST" + exit 1 + fi + grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" + total_lines=$(wc -l < "${WORKSPACE}/clean_list.txt") - # First, generate the list - generate_subgraph_list ${RAW_OUTPUT_DIR} ${RAW_SAMPLE_LIST} + echo ">>> Total Models: $total_lines | GPUS: $NUM_GPUS" + + # 2. Sharding + lines_per_gpu=$(( (total_lines + NUM_GPUS - 1) / NUM_GPUS )) + split -l ${lines_per_gpu} -d "${WORKSPACE}/clean_list.txt" "${WORKSPACE}/gpu_chunk_" + + # 3. Parallel Execution + for (( i=0; i>> Launching Worker for GPU $i..." + ( + while read -r model_path; do + process_single_model "$model_path" "$i" || true + done < "$chunk_file" + ) & + done + + # 4. Wait + echo ">>> Waiting for workers..." + wait + echo ">>> Generation Phase Complete." + + # ========================================================================== + # Post-processing + # ========================================================================== + + echo ">>> Starting Renaming Phase..." + generate_subgraph_list ${RAW_SUBGRAPH_DIR} "${WORKSPACE}/raw_list.txt" + # We redirect output to a main log file here because it's a single process $PYTHON_EXEC -m graph_net.model_path_handler \ - --model-path-list ${RAW_SAMPLE_LIST} \ + --model-path-list "${WORKSPACE}/raw_list.txt" \ --handler-config=$(base64 -w 0 <> "${LOG_DIR}/renaming.log" 2>&1 -function deduplicate_subgraphs() { - echo ">>> [3] Remove duplicated subgraph samples under ${RENAMED_OUTPUT_DIR}." - echo ">>>" + echo ">>> Starting Deduplication Phase..." + if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi - - if [ -d "${DEDUPLICATED_OUTPUT_DIR}" ]; then - echo ">>> Target directory exists. Cleaning up..." - rm -rf "${DEDUPLICATED_OUTPUT_DIR}" - fi - $PYTHON_EXEC -m graph_net.tools.deduplicated \ - --samples-dir ${RENAMED_OUTPUT_DIR} \ - --target-dir ${DEDUPLICATED_OUTPUT_DIR} -} - -# ============================================================================== -# Main Workflow -# ============================================================================== - -main() { - echo "==========================================================" - echo "START: Single Operator Dataset Generation Pipeline" - echo "Workspace: $WORKSPACE" - echo "==========================================================" - - # 1. Generate raw data - generate_raw_data - - # 2. Rename variables (Standardization) - rename_subgraphs - - # 3. Deduplicate - deduplicate_subgraphs + --samples-dir ${RENAMED_DIR} \ + --target-dir ${DEDUPLICATED_DIR} >> "${LOG_DIR}/deduplication.log" 2>&1 - echo "==========================================================" - echo "FINISH: Dataset generated at ${DEDUPLICATED_OUTPUT_DIR}" - echo "==========================================================" + echo ">>> ALL DONE. 
Final dataset located at: ${DEDUPLICATED_DIR}"
+    echo ">>> Check ${LOG_DIR} for error logs if any failures occurred."
+}

main

From 3e250b00f5e7dc1e7dc745aaedb1bf1876403d65 Mon Sep 17 00:00:00 2001
From: ywh555hhh <1916647616@qq.com>
Date: Mon, 9 Feb 2026 16:42:24 +0800
Subject: [PATCH 3/6] Refactor dataset generation script to strict serial
 execution mode

---
 graph_net/test/generate_single_op_dataset.sh | 241 ++++++-------------
 1 file changed, 68 insertions(+), 173 deletions(-)

diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh
index f3c74635b..804feff7e 100644
--- a/graph_net/test/generate_single_op_dataset.sh
+++ b/graph_net/test/generate_single_op_dataset.sh
@@ -5,207 +5,105 @@
 # Configuration Area
 # ==============================================================================
 
-# [CRITICAL NOTICE]
-# This script now uses dynamic path detection.
-# Ensure you are running inside the correct Virtual Environment.
-
-# 1. Dynamic Path Retrieval (Fixing Hardcoded Paths)
-# Detect python executable from current PATH
+# Dynamic Path Retrieval
 PYTHON_EXEC=$(which python3)
 if [ -z "$PYTHON_EXEC" ]; then
     echo "Error: 'python3' not found in PATH. Please activate your virtualenv."
     exit 1
 fi
 
-# Detect Project Root dynamically by importing the module
 GRAPH_NET_ROOT=$($PYTHON_EXEC -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
 if [ -z "$GRAPH_NET_ROOT" ]; then
     echo "Error: Could not determine GRAPH_NET_ROOT. Ensure 'graph_net' is installed or in PYTHONPATH."
     exit 1
 fi
 
-# 2. Parallel Processing Config
-AUTO_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
-if [ "$AUTO_GPUS" -eq 0 ]; then AUTO_GPUS=1; fi
-
-# Logic:
-# 1. If the script is invoked with an argument (e.g., ./script.sh 8), use that value.
-# 2. Otherwise, use the auto-detected GPU count.
-NUM_GPUS=${1:-$AUTO_GPUS}
-
-echo ">>> Detected/Set NUM_GPUS: ${NUM_GPUS}"
-
 RESUME="false"
 
-# 3. Workspace Setup
+# Workspace Setup
 TIMESTAMP=$(date +%Y%m%d_%H%M)
 WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
-# Override via the MODEL_LIST environment variable if needed
 MODEL_LIST="${MODEL_LIST:-${GRAPH_NET_ROOT}/graph_net/config/small100_torch_samples_list.txt}"
 
-# 4. Output Directories
+# Output Directories
 OP_NAMES_DIR="${WORKSPACE}/01_op_names"
 RANGES_DIR="${WORKSPACE}/02_ranges"
 RAW_SUBGRAPH_DIR="${WORKSPACE}/03_raw_subgraphs"
 RENAMED_DIR="${WORKSPACE}/04_renamed"
 DEDUPLICATED_DIR="${WORKSPACE}/05_deduplicated"
-LOG_DIR="${WORKSPACE}/logs" # New: Dedicated log directory
 
-export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
-export GRAPH_NET_ROOT PYTHON_EXEC WORKSPACE OP_NAMES_DIR RANGES_DIR RAW_SUBGRAPH_DIR RESUME LOG_DIR
-
-mkdir -p "$WORKSPACE" "$LOG_DIR"
+mkdir -p "$WORKSPACE"
 
 # ==============================================================================
-# Core Logic: Single Model Processing (V3: Strict Error Checking)
+# Main Pipeline
 # ==============================================================================
 
-process_single_model() {
-    local model_path=$1
-    local gpu_id=$2
-
-    export CUDA_VISIBLE_DEVICES="${gpu_id}"
-
-    local safe_name=$(basename "$model_path")
-    local tmp_list="${WORKSPACE}/tmp_list_${BASHPID}.txt"
-    local log_file="${LOG_DIR}/${safe_name}_${BASHPID}.log"
-
-    echo "${model_path}" > "${tmp_list}"
-    echo "=== Processing ${model_path} ===" > "$log_file"
-
-    run_step() {
-        local step_name=$1
-        local cmd_str=$2
-
-        echo "---------------------------------------------------" >> "$log_file"
-        echo ">>> Running Stage: ${step_name}" >> "$log_file"
-
-
-        if ! 
eval "$cmd_str" >> "$log_file" 2>&1; then - echo "[GPU ${gpu_id}] System Failed at ${step_name}: ${model_path}" - return 1 - fi - - if grep -q -E "Traceback \(most recent call last\)|Error:|Exception:" "$log_file"; then - echo "[GPU ${gpu_id}] Logic Failed at ${step_name} (Found Traceback): ${model_path}" - echo " -> Log saved at: ${log_file}" - tail -n 5 "$log_file" | sed "s/^/[GPU ${gpu_id}] /" - return 1 - fi - - return 0 - } - # --- Stage 1: Op Names --- - cmd_s1="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Starting Pipeline..." +echo " Python: $PYTHON_EXEC" +echo " Root: $GRAPH_NET_ROOT" + +# 1. Prepare Data +if [ ! -f "$MODEL_LIST" ]; then + echo "Error: Model list not found at $MODEL_LIST" + exit 1 +fi + +grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" + +# 2. Stage 1: Op Names +echo ">>> Running Stage 1: Op Names..." +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --handler-config=$(base64 -w 0 <>> Running Stage 2: Ranges..." +python3 -m graph_net.apply_sample_pass \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/sample_pass/op_extract_points_generator.py" \ + --sample-pass-class-name "OpExtractPointsGenerator" \ + --sample-pass-config=$(base64 -w 0 <>> Running Stage 3: Decompose..." +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --handler-config=$(base64 -w 0 <>> Generating subgraph list for ${target_dir}..." - find ${target_dir} -name "model.py" \ - | xargs dirname \ - | xargs realpath --relative-to=${target_dir} \ - | tee $sample_list > /dev/null -} - -# ============================================================================== -# Main Pipeline Dispatcher -# ============================================================================== -function main() { - echo ">>> Starting Pipeline..." - echo " Python: $PYTHON_EXEC" - echo " Root: $GRAPH_NET_ROOT" - echo " Logs: $LOG_DIR" - - # 1. Prepare Data - if [ ! -f "$MODEL_LIST" ]; then - echo "Error: Model list not found at $MODEL_LIST" - exit 1 - fi - grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" - total_lines=$(wc -l < "${WORKSPACE}/clean_list.txt") - - echo ">>> Total Models: $total_lines | GPUS: $NUM_GPUS" - - # 2. Sharding - lines_per_gpu=$(( (total_lines + NUM_GPUS - 1) / NUM_GPUS )) - split -l ${lines_per_gpu} -d "${WORKSPACE}/clean_list.txt" "${WORKSPACE}/gpu_chunk_" - - # 3. Parallel Execution - for (( i=0; i>> Launching Worker for GPU $i..." - ( - while read -r model_path; do - process_single_model "$model_path" "$i" || true - done < "$chunk_file" - ) & - done - - # 4. Wait - echo ">>> Waiting for workers..." - wait - echo ">>> Generation Phase Complete." - - # ========================================================================== - # Post-processing - # ========================================================================== - - echo ">>> Starting Renaming Phase..." - generate_subgraph_list ${RAW_SUBGRAPH_DIR} "${WORKSPACE}/raw_list.txt" - - # We redirect output to a main log file here because it's a single process - $PYTHON_EXEC -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/raw_list.txt" \ - --handler-config=$(base64 -w 0 <>> Running Post-processing: Rename..." 
+find ${RAW_SUBGRAPH_DIR} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ + > "${WORKSPACE}/raw_list.txt" + +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/raw_list.txt" \ + --handler-config=$(base64 -w 0 <> "${LOG_DIR}/renaming.log" 2>&1 - - echo ">>> Starting Deduplication Phase..." - if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi +) - $PYTHON_EXEC -m graph_net.tools.deduplicated \ - --samples-dir ${RENAMED_DIR} \ - --target-dir ${DEDUPLICATED_DIR} >> "${LOG_DIR}/deduplication.log" 2>&1 +# 6. Post-processing: Deduplicate +echo ">>> Running Post-processing: Deduplicate..." +if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi - echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" - echo ">>> Check ${LOG_DIR} for error logs if any failures occurred." -} +python3 -m graph_net.tools.deduplicated \ + --samples-dir ${RENAMED_DIR} \ + --target-dir ${DEDUPLICATED_DIR} -main +echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" From f4f8c952f2e1175c7183b939b8d4a5bb3135bbdb Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 16:50:09 +0800 Subject: [PATCH 4/6] extract raw_list.txt generation as separate step and copy to final output --- graph_net/test/generate_single_op_dataset.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 804feff7e..2fbcb3231 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -94,13 +94,15 @@ python3 -m graph_net.model_path_handler \ EOF ) -# 5. Post-processing: Rename -echo ">>> Running Post-processing: Rename..." +# 5. Generate raw_list.txt +echo ">>> Generating raw_list.txt..." find ${RAW_SUBGRAPH_DIR} -name "model.py" \ | xargs dirname \ | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ > "${WORKSPACE}/raw_list.txt" +# 6. Post-processing: Rename +echo ">>> Running Post-processing: Rename..." python3 -m graph_net.model_path_handler \ --model-path-list "${WORKSPACE}/raw_list.txt" \ --handler-config=$(base64 -w 0 <>> Running Post-processing: Deduplicate..." if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi @@ -128,4 +130,8 @@ python3 -m graph_net.tools.deduplicated \ --samples-dir ${RENAMED_DIR} \ --target-dir ${DEDUPLICATED_DIR} +# Copy raw_list.txt to final output +cp "${WORKSPACE}/raw_list.txt" "${DEDUPLICATED_DIR}/" + echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" +echo ">>> raw_list.txt also saved to: ${DEDUPLICATED_DIR}/raw_list.txt" From 570d7d128f93d8372d5ec7a079fafc572028c8ff Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 18:37:37 +0800 Subject: [PATCH 5/6] remove redundant clean_list.txt and rename raw_list.txt to generated_subgraphs_list.txt --- graph_net/test/generate_single_op_dataset.sh | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 2fbcb3231..8781913bf 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -48,12 +48,10 @@ if [ ! -f "$MODEL_LIST" ]; then exit 1 fi -grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" - # 2. Stage 1: Op Names echo ">>> Running Stage 1: Op Names..." 
python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --handler-config=$(base64 -w 0 <>> Running Stage 2: Ranges..." python3 -m graph_net.apply_sample_pass \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/sample_pass/op_extract_points_generator.py" \ --sample-pass-class-name "OpExtractPointsGenerator" \ --sample-pass-config=$(base64 -w 0 <>> Running Stage 3: Decompose..." python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --handler-config=$(base64 -w 0 <>> Generating raw_list.txt..." +# 5. Generate generated_subgraphs_list.txt +echo ">>> Generating generated_subgraphs_list.txt..." find ${RAW_SUBGRAPH_DIR} -name "model.py" \ | xargs dirname \ | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ - > "${WORKSPACE}/raw_list.txt" + > "${WORKSPACE}/generated_subgraphs_list.txt" # 6. Post-processing: Rename echo ">>> Running Post-processing: Rename..." python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/raw_list.txt" \ + --model-path-list "${WORKSPACE}/generated_subgraphs_list.txt" \ --handler-config=$(base64 -w 0 <>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" -echo ">>> raw_list.txt also saved to: ${DEDUPLICATED_DIR}/raw_list.txt" +echo ">>> generated_subgraphs_list.txt also saved to: ${DEDUPLICATED_DIR}/generated_subgraphs_list.txt" From 304dfe6c7573e30d1f30f23bb93bd7cb78f73281 Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 18:39:58 +0800 Subject: [PATCH 6/6] move generate_single_op_dataset.sh to tools directory --- graph_net/{test => tools}/generate_single_op_dataset.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename graph_net/{test => tools}/generate_single_op_dataset.sh (100%) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/tools/generate_single_op_dataset.sh similarity index 100% rename from graph_net/test/generate_single_op_dataset.sh rename to graph_net/tools/generate_single_op_dataset.sh
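
Note on the config-passing convention used throughout this series: every
stage hands its options to graph_net as a base64-encoded JSON blob. A minimal
sketch of the round trip on the sending side (the JSON keys here are
illustrative placeholders, not the actual GraphNet handler schema; the
receiving pass presumably decodes with the standard base64/json libraries):

    CFG=$(printf '%s' '{"resume": false, "output_dir": "/tmp/out"}' | base64 -w 0)
    echo "$CFG" | base64 -d    # round-trips back to the original JSON

Typical invocation of the final script, assuming 'graph_net' is importable
from the active virtualenv (MODEL_LIST falls back to the bundled small100
list when unset):

    MODEL_LIST=/path/to/your_full_samples_list.txt \
        bash graph_net/tools/generate_single_op_dataset.sh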