From 9ec7bdc13fe3d39021a4375e77498ecf5a143600 Mon Sep 17 00:00:00 2001 From: Wayne E <1916647616@qq.com> Date: Thu, 5 Feb 2026 15:49:58 +0800 Subject: [PATCH 1/6] feat: add single-operator subgraph dataset generation script This commit introduces `generate_single_op_dataset.sh` to automate the workflow for generating single-operator subgraph datasets. --- graph_net/test/generate_single_op_dataset.sh | 323 +++++++++++++++++++ 1 file changed, 323 insertions(+) create mode 100644 graph_net/test/generate_single_op_dataset.sh diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh new file mode 100644 index 000000000..4dd94cbd6 --- /dev/null +++ b/graph_net/test/generate_single_op_dataset.sh @@ -0,0 +1,323 @@ +#!/bin/bash +set -x + +################################################################################ +# [CRITICAL NOTICE] BEFORE RUNNING THIS SCRIPT: +# +# /graph_net/test/generate_single_op_dataset.sh +# +# 1. Check 'PYTHON_EXEC': Ensure the variable below points to the correct +# Python interpreter in your virtual environment. +# +# 2. Check 'INPUT_LIST': Look for the 'INPUT_LIST' variable inside the +# internal Python script (Stage 1 section). It is currently hardcoded +# to 'small10_torch_samples_list.txt'. Please switch it to your full +# dataset list file before running large-scale generation. +################################################################################ + +# ============================================================================== +# Configuration Area +# ============================================================================== + +# [TODO] HARDCODED: Paths are currently hardcoded; needs dynamic retrieval or arguments in the future. +# Virtual Environment Python Executable Path +PYTHON_EXEC="/workspace/venv_graphnet/bin/python3" +# Project Root Directory +GRAPH_NET_ROOT="/workspace/GraphNet" + +# Script Runtime Arguments +GPU_ID=${1:-0} +RESUME="false" + +# Export environment variables to ensure Python can find graph_net +export CUDA_VISIBLE_DEVICES="${GPU_ID}" +export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}" + +# Workspace Configuration +TIMESTAMP=$(date +%Y%m%d_%H%M%S) +WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}" + +# Define standardized output directory structure +RAW_OUTPUT_DIR="${WORKSPACE}/01_raw_single_op_subgraphs" +RENAMED_OUTPUT_DIR="${WORKSPACE}/02_renamed_single_op_subgraphs" +DEDUPLICATED_OUTPUT_DIR="${WORKSPACE}/03_deduplicated_single_op_subgraphs" + +# Define intermediate list file paths +RAW_SAMPLE_LIST="${WORKSPACE}/sample_list_01_raw.txt" +RENAMED_SAMPLE_LIST="${WORKSPACE}/sample_list_02_renamed.txt" + +# Create workspace +mkdir -p "$WORKSPACE" "$RAW_OUTPUT_DIR" + +# ============================================================================== +# Helper Functions +# ============================================================================== + +# Subgraph list generation function (mimics the original script) +function generate_subgraph_list() { + local target_dir="$1" + local sample_list="$2" + echo ">>> Generate subgraph_sample_list for samples under ${target_dir}." 
+ echo ">>>" + + # Find parent directories of all model.py files to identify valid samples + find ${target_dir} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${target_dir} \ + | tee $sample_list +} + +# ============================================================================== +# Stage 1: Generation (Black Box Mode) +# ============================================================================== + +function generate_raw_data() { + echo ">>> [1] Generating Single Operator Subgraphs (Running Python Script)..." + echo ">>>" + + local TEMP_GEN_SCRIPT="${WORKSPACE}/_internal_gen.py" + + # 1. Write the Python script to a temporary file + # Note: The Python logic is preserved exactly as provided + cat << 'EOF' > "$TEMP_GEN_SCRIPT" +import os +import sys +import time +import math +import subprocess +import datetime +import multiprocessing +import json +import base64 + +# [TODO] HARDCODED: Keep sync with Shell script +PYTHON_EXEC = "/workspace/venv_graphnet/bin/python3" +PROJECT_ROOT = "/workspace/GraphNet" +# [dependency] WARNING: This is currently pointing to the small 10 sample list +INPUT_LIST = os.path.join(PROJECT_ROOT, "graph_net/config/small10_torch_samples_list.txt") + +NUM_GPUS = 2 +TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") +# Temporary directory generated inside the Python script +BASE_DIR = f"/tmp/decompose_run_{TIMESTAMP}" + +def make_config_b64(config_dict): + json_str = json.dumps(config_dict) + return base64.b64encode(json_str.encode('utf-8')).decode('utf-8') + +def run_stage_cmd(env, cwd, cmd_args, stage_name, log_file): + cmd = [PYTHON_EXEC, "-u", "-m", "graph_net.apply_sample_pass"] + cmd_args + try: + result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) + with open(log_file, "a") as f: + if result.returncode != 0: + f.write(f"\n[FAIL] {stage_name} Error (Exit {result.returncode}):\n") + f.write(result.stderr[-2000:] + "\n") + return False + else: + return True + except Exception as e: + with open(log_file, "a") as f: + f.write(f"\n[CRITICAL] {stage_name} Exception: {str(e)}\n") + return False + +def worker_process(gpu_id, models, base_dir): + log_file = os.path.join(base_dir, "logs", f"worker_gpu{gpu_id}.log") + workspace = base_dir + ranges_dir = os.path.join(workspace, "workspace_single_operator_ranges") + + env = os.environ.copy() + env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) + env["GRAPH_NET_ROOT"] = PROJECT_ROOT + env["PYTHONPATH"] = PROJECT_ROOT + env["PYTHONUNBUFFERED"] = "1" + + with open(log_file, "w") as f: + f.write(f"==== Worker GPU {gpu_id} Started ====\n") + + for idx, model_path in enumerate(models): + model_output_dir = os.path.join(workspace, model_path) + os.makedirs(model_output_dir, exist_ok=True) + os.makedirs(ranges_dir, exist_ok=True) + + # Stage 1: OpNamesExtractor + cfg_s1 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "output_dir": workspace + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/op_names_extractor.py", + "--sample-pass-class-name", "OpNamesExtractor", + "--sample-pass-config", cfg_s1 + ], "Stage 1", log_file) + + # Stage 2: OpExtractPointsGenerator + cfg_s2 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "op_names_path_prefix": workspace, + "output_dir": ranges_dir, + "subgraph_ranges_file_name": "subgraph_ranges.json" + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + 
"--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/sample_pass/op_extract_points_generator.py", + "--sample-pass-class-name", "OpExtractPointsGenerator", + "--sample-pass-config", cfg_s2 + ], "Stage 2", log_file) + + # Stage 3: SubgraphGenerator + cfg_s3 = make_config_b64({ + "resume": False, + "model_path_prefix": PROJECT_ROOT, + "output_dir": workspace, + "subgraph_ranges_json_root": ranges_dir, + "group_head_and_tail": False, + "chain_style": False + }) + run_stage_cmd(env, PROJECT_ROOT, [ + "--model-path", model_path, + "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/subgraph_generator.py", + "--sample-pass-class-name", "SubgraphGenerator", + "--sample-pass-config", cfg_s3 + ], "Stage 3", log_file) + +def main(): + if not os.path.exists(PYTHON_EXEC): return + os.makedirs(BASE_DIR, exist_ok=True) + os.makedirs(os.path.join(BASE_DIR, "logs"), exist_ok=True) + + # This line is the key anchor for the Shell script to capture the path + print(f"Workspace: {BASE_DIR}") + print(f"Dataset Generation Started...") + + with open(INPUT_LIST, 'r') as f: + all_models = [l.strip() for l in f if l.strip() and not l.startswith("#")] + + chunk_size = math.ceil(len(all_models) / NUM_GPUS) + processes = [] + for i in range(NUM_GPUS): + chunk = all_models[i*chunk_size : (i+1)*chunk_size] + if not chunk: continue + p = multiprocessing.Process(target=worker_process, args=(i, chunk, BASE_DIR)) + p.start() + processes.append(p) + + for p in processes: + p.join() + +if __name__ == "__main__": + multiprocessing.set_start_method('spawn', force=True) + main() +EOF + + # 2. Execute the Python script and capture the output directory + # We use tee to output logs to screen and grep to capture the "Workspace: " line + echo ">>> Running internal python generator..." + PYTHON_OUTPUT=$($PYTHON_EXEC $TEMP_GEN_SCRIPT | tee /dev/tty) + + # Extract the generated temporary path + TEMP_SRC_DIR=$(echo "$PYTHON_OUTPUT" | grep "Workspace:" | awk '{print $2}' | tr -d '\r') + + if [ -z "$TEMP_SRC_DIR" ]; then + echo "Error: Could not capture workspace path from python script." + exit 1 + fi + + echo ">>> Python script finished. Temporary output at: $TEMP_SRC_DIR" + + # 3. Move Step + echo ">>> Moving data from temp dir to standardized dir: $RAW_OUTPUT_DIR" + # We only move the generated subgraph folders, excluding logs and range files. + # Assuming subgraphs are generated inside model directories under BASE_DIR, + # we move everything first, then clean up. + + # Move all content + cp -r ${TEMP_SRC_DIR}/* ${RAW_OUTPUT_DIR}/ + + # Clean up unnecessary intermediate artifacts (ranges and logs), keeping only subgraphs + rm -rf ${RAW_OUTPUT_DIR}/logs + rm -rf ${RAW_OUTPUT_DIR}/workspace_single_operator_ranges + + echo ">>> Data moved and cleaned." +} + +# ============================================================================== +# Stage 2: Renaming +# ============================================================================== + +function rename_subgraphs() { + echo ">>> [2] Rename subgraph samples under ${RAW_OUTPUT_DIR}." + echo ">>>" + + # First, generate the list + generate_subgraph_list ${RAW_OUTPUT_DIR} ${RAW_SAMPLE_LIST} + + $PYTHON_EXEC -m graph_net.model_path_handler \ + --model-path-list ${RAW_SAMPLE_LIST} \ + --handler-config=$(base64 -w 0 <>> [3] Remove duplicated subgraph samples under ${RENAMED_OUTPUT_DIR}." + echo ">>>" + + + if [ -d "${DEDUPLICATED_OUTPUT_DIR}" ]; then + echo ">>> Target directory exists. Cleaning up..." 
+ rm -rf "${DEDUPLICATED_OUTPUT_DIR}" + fi + + $PYTHON_EXEC -m graph_net.tools.deduplicated \ + --samples-dir ${RENAMED_OUTPUT_DIR} \ + --target-dir ${DEDUPLICATED_OUTPUT_DIR} +} + +# ============================================================================== +# Main Workflow +# ============================================================================== + +main() { + echo "==========================================================" + echo "START: Single Operator Dataset Generation Pipeline" + echo "Workspace: $WORKSPACE" + echo "==========================================================" + + # 1. Generate raw data + generate_raw_data + + # 2. Rename variables (Standardization) + rename_subgraphs + + # 3. Deduplicate + deduplicate_subgraphs + + echo "==========================================================" + echo "FINISH: Dataset generated at ${DEDUPLICATED_OUTPUT_DIR}" + echo "==========================================================" +} + +main From ca2699bcd56f4f9673bc29b906881920bed621ac Mon Sep 17 00:00:00 2001 From: Wayne E <1916647616@qq.com> Date: Fri, 6 Feb 2026 17:20:28 +0800 Subject: [PATCH 2/6] Refactor generate_single_op_dataset.sh for dynamic paths Refactor script for dynamic path detection and improved error handling. Added logging and workspace setup enhancements. --- graph_net/test/generate_single_op_dataset.sh | 455 ++++++++----------- 1 file changed, 184 insertions(+), 271 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 4dd94cbd6..f3c74635b 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -1,323 +1,236 @@ -#!/bin/bash -set -x - -################################################################################ -# [CRITICAL NOTICE] BEFORE RUNNING THIS SCRIPT: -# -# /graph_net/test/generate_single_op_dataset.sh -# -# 1. Check 'PYTHON_EXEC': Ensure the variable below points to the correct -# Python interpreter in your virtual environment. -# -# 2. Check 'INPUT_LIST': Look for the 'INPUT_LIST' variable inside the -# internal Python script (Stage 1 section). It is currently hardcoded -# to 'small10_torch_samples_list.txt'. Please switch it to your full -# dataset list file before running large-scale generation. -################################################################################ +#!/bin/bash +set -e # ============================================================================== # Configuration Area # ============================================================================== -# [TODO] HARDCODED: Paths are currently hardcoded; needs dynamic retrieval or arguments in the future. -# Virtual Environment Python Executable Path -PYTHON_EXEC="/workspace/venv_graphnet/bin/python3" -# Project Root Directory -GRAPH_NET_ROOT="/workspace/GraphNet" +# [CRITICAL NOTICE] +# This script now uses dynamic path detection. +# Ensure you are running inside the correct Virtual Environment. -# Script Runtime Arguments -GPU_ID=${1:-0} -RESUME="false" +# 1. Dynamic Path Retrieval (Fixing Hardcoded Paths) +# Detect python executable from current PATH +PYTHON_EXEC=$(which python3) +if [ -z "$PYTHON_EXEC" ]; then + echo "Error: 'python3' not found in PATH. Please activate your virtualenv." 
+    exit 1
+fi
 
-# Export environment variables to ensure Python can find graph_net
-export CUDA_VISIBLE_DEVICES="${GPU_ID}"
-export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
+# Detect Project Root dynamically by importing the module
+GRAPH_NET_ROOT=$($PYTHON_EXEC -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
+if [ -z "$GRAPH_NET_ROOT" ]; then
+    echo "Error: Could not determine GRAPH_NET_ROOT. Ensure 'graph_net' is installed or in PYTHONPATH."
+    exit 1
+fi
 
-# Workspace Configuration
-TIMESTAMP=$(date +%Y%m%d_%H%M%S)
-WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
+# 2. Parallel Processing Config
+AUTO_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
+if [ "$AUTO_GPUS" -eq 0 ]; then AUTO_GPUS=1; fi
 
-# Define standardized output directory structure
-RAW_OUTPUT_DIR="${WORKSPACE}/01_raw_single_op_subgraphs"
-RENAMED_OUTPUT_DIR="${WORKSPACE}/02_renamed_single_op_subgraphs"
-DEDUPLICATED_OUTPUT_DIR="${WORKSPACE}/03_deduplicated_single_op_subgraphs"
+# Logic:
+# 1. If the script is invoked with an argument (e.g., ./script.sh 8), use that value.
+# 2. Otherwise, use the auto-detected GPU count.
+NUM_GPUS=${1:-$AUTO_GPUS}
 
-# Define intermediate list file paths
-RAW_SAMPLE_LIST="${WORKSPACE}/sample_list_01_raw.txt"
-RENAMED_SAMPLE_LIST="${WORKSPACE}/sample_list_02_renamed.txt"
+echo ">>> Detected/Set NUM_GPUS: ${NUM_GPUS}"
 
-# Create workspace
-mkdir -p "$WORKSPACE" "$RAW_OUTPUT_DIR"
+RESUME="false"
 
-# ==============================================================================
-# Helper Functions
-# ==============================================================================
+# 3. Workspace Setup
+TIMESTAMP=$(date +%Y%m%d_%H%M)
+WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
+# Override via the MODEL_LIST environment variable if needed
+MODEL_LIST="${MODEL_LIST:-${GRAPH_NET_ROOT}/graph_net/config/small100_torch_samples_list.txt}"
 
-# Subgraph list generation function (mimics the original script)
-function generate_subgraph_list() {
-    local target_dir="$1"
-    local sample_list="$2"
-    echo ">>> Generate subgraph_sample_list for samples under ${target_dir}."
-    echo ">>>"
-
-    # Find parent directories of all model.py files to identify valid samples
-    find ${target_dir} -name "model.py" \
-        | xargs dirname \
-        | xargs realpath --relative-to=${target_dir} \
-        | tee $sample_list
-}
+# 4. Output Directories
+OP_NAMES_DIR="${WORKSPACE}/01_op_names"
+RANGES_DIR="${WORKSPACE}/02_ranges"
+RAW_SUBGRAPH_DIR="${WORKSPACE}/03_raw_subgraphs"
+RENAMED_DIR="${WORKSPACE}/04_renamed"
+DEDUPLICATED_DIR="${WORKSPACE}/05_deduplicated"
+LOG_DIR="${WORKSPACE}/logs" # New: Dedicated log directory
+
+export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
+export GRAPH_NET_ROOT PYTHON_EXEC WORKSPACE OP_NAMES_DIR RANGES_DIR RAW_SUBGRAPH_DIR RESUME LOG_DIR
+
+mkdir -p "$WORKSPACE" "$LOG_DIR"
 
 # ==============================================================================
-# Stage 1: Generation (Black Box Mode)
+# Core Logic: Single Model Processing (V3: Strict Error Checking)
 # ==============================================================================
-
-function generate_raw_data() {
-    echo ">>> [1] Generating Single Operator Subgraphs (Running Python Script)..."
-    echo ">>>"
-
-    local TEMP_GEN_SCRIPT="${WORKSPACE}/_internal_gen.py"
-
-    # 1. 
Write the Python script to a temporary file - # Note: The Python logic is preserved exactly as provided - cat << 'EOF' > "$TEMP_GEN_SCRIPT" -import os -import sys -import time -import math -import subprocess -import datetime -import multiprocessing -import json -import base64 - -# [TODO] HARDCODED: Keep sync with Shell script -PYTHON_EXEC = "/workspace/venv_graphnet/bin/python3" -PROJECT_ROOT = "/workspace/GraphNet" -# [dependency] WARNING: This is currently pointing to the small 10 sample list -INPUT_LIST = os.path.join(PROJECT_ROOT, "graph_net/config/small10_torch_samples_list.txt") - -NUM_GPUS = 2 -TIMESTAMP = datetime.datetime.now().strftime("%Y%m%d_%H%M%S") -# Temporary directory generated inside the Python script -BASE_DIR = f"/tmp/decompose_run_{TIMESTAMP}" - -def make_config_b64(config_dict): - json_str = json.dumps(config_dict) - return base64.b64encode(json_str.encode('utf-8')).decode('utf-8') - -def run_stage_cmd(env, cwd, cmd_args, stage_name, log_file): - cmd = [PYTHON_EXEC, "-u", "-m", "graph_net.apply_sample_pass"] + cmd_args - try: - result = subprocess.run(cmd, cwd=cwd, env=env, capture_output=True, text=True) - with open(log_file, "a") as f: - if result.returncode != 0: - f.write(f"\n[FAIL] {stage_name} Error (Exit {result.returncode}):\n") - f.write(result.stderr[-2000:] + "\n") - return False - else: - return True - except Exception as e: - with open(log_file, "a") as f: - f.write(f"\n[CRITICAL] {stage_name} Exception: {str(e)}\n") - return False - -def worker_process(gpu_id, models, base_dir): - log_file = os.path.join(base_dir, "logs", f"worker_gpu{gpu_id}.log") - workspace = base_dir - ranges_dir = os.path.join(workspace, "workspace_single_operator_ranges") - - env = os.environ.copy() - env["CUDA_VISIBLE_DEVICES"] = str(gpu_id) - env["GRAPH_NET_ROOT"] = PROJECT_ROOT - env["PYTHONPATH"] = PROJECT_ROOT - env["PYTHONUNBUFFERED"] = "1" - - with open(log_file, "w") as f: - f.write(f"==== Worker GPU {gpu_id} Started ====\n") +process_single_model() { + local model_path=$1 + local gpu_id=$2 - for idx, model_path in enumerate(models): - model_output_dir = os.path.join(workspace, model_path) - os.makedirs(model_output_dir, exist_ok=True) - os.makedirs(ranges_dir, exist_ok=True) - - # Stage 1: OpNamesExtractor - cfg_s1 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "output_dir": workspace - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/torch/sample_pass/op_names_extractor.py", - "--sample-pass-class-name", "OpNamesExtractor", - "--sample-pass-config", cfg_s1 - ], "Stage 1", log_file) - - # Stage 2: OpExtractPointsGenerator - cfg_s2 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "op_names_path_prefix": workspace, - "output_dir": ranges_dir, - "subgraph_ranges_file_name": "subgraph_ranges.json" - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", f"{PROJECT_ROOT}/graph_net/sample_pass/op_extract_points_generator.py", - "--sample-pass-class-name", "OpExtractPointsGenerator", - "--sample-pass-config", cfg_s2 - ], "Stage 2", log_file) - - # Stage 3: SubgraphGenerator - cfg_s3 = make_config_b64({ - "resume": False, - "model_path_prefix": PROJECT_ROOT, - "output_dir": workspace, - "subgraph_ranges_json_root": ranges_dir, - "group_head_and_tail": False, - "chain_style": False - }) - run_stage_cmd(env, PROJECT_ROOT, [ - "--model-path", model_path, - "--sample-pass-file-path", 
f"{PROJECT_ROOT}/graph_net/torch/sample_pass/subgraph_generator.py", - "--sample-pass-class-name", "SubgraphGenerator", - "--sample-pass-config", cfg_s3 - ], "Stage 3", log_file) - -def main(): - if not os.path.exists(PYTHON_EXEC): return - os.makedirs(BASE_DIR, exist_ok=True) - os.makedirs(os.path.join(BASE_DIR, "logs"), exist_ok=True) + export CUDA_VISIBLE_DEVICES="${gpu_id}" - # This line is the key anchor for the Shell script to capture the path - print(f"Workspace: {BASE_DIR}") - print(f"Dataset Generation Started...") + local safe_name=$(basename "$model_path") + local tmp_list="${WORKSPACE}/tmp_list_${BASHPID}.txt" + local log_file="${LOG_DIR}/${safe_name}_${BASHPID}.log" + + echo "${model_path}" > "${tmp_list}" + echo "=== Processing ${model_path} ===" > "$log_file" + + run_step() { + local step_name=$1 + local cmd_str=$2 + + echo "---------------------------------------------------" >> "$log_file" + echo ">>> Running Stage: ${step_name}" >> "$log_file" + + + if ! eval "$cmd_str" >> "$log_file" 2>&1; then + echo "[GPU ${gpu_id}] System Failed at ${step_name}: ${model_path}" + return 1 + fi + + if grep -q -E "Traceback \(most recent call last\)|Error:|Exception:" "$log_file"; then + echo "[GPU ${gpu_id}] Logic Failed at ${step_name} (Found Traceback): ${model_path}" + echo " -> Log saved at: ${log_file}" + tail -n 5 "$log_file" | sed "s/^/[GPU ${gpu_id}] /" + return 1 + fi + + return 0 + } - with open(INPUT_LIST, 'r') as f: - all_models = [l.strip() for l in f if l.strip() and not l.startswith("#")] - - chunk_size = math.ceil(len(all_models) / NUM_GPUS) - processes = [] - for i in range(NUM_GPUS): - chunk = all_models[i*chunk_size : (i+1)*chunk_size] - if not chunk: continue - p = multiprocessing.Process(target=worker_process, args=(i, chunk, BASE_DIR)) - p.start() - processes.append(p) + # --- Stage 1: Op Names --- + cmd_s1="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Running internal python generator..." - PYTHON_OUTPUT=$($PYTHON_EXEC $TEMP_GEN_SCRIPT | tee /dev/tty) - - # Extract the generated temporary path - TEMP_SRC_DIR=$(echo "$PYTHON_OUTPUT" | grep "Workspace:" | awk '{print $2}' | tr -d '\r') + # --- Stage 3: Decompose --- + cmd_s3="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Python script finished. Temporary output at: $TEMP_SRC_DIR" - - # 3. Move Step - echo ">>> Moving data from temp dir to standardized dir: $RAW_OUTPUT_DIR" - # We only move the generated subgraph folders, excluding logs and range files. - # Assuming subgraphs are generated inside model directories under BASE_DIR, - # we move everything first, then clean up. - - # Move all content - cp -r ${TEMP_SRC_DIR}/* ${RAW_OUTPUT_DIR}/ - - # Clean up unnecessary intermediate artifacts (ranges and logs), keeping only subgraphs - rm -rf ${RAW_OUTPUT_DIR}/logs - rm -rf ${RAW_OUTPUT_DIR}/workspace_single_operator_ranges - - echo ">>> Data moved and cleaned." + echo "[GPU ${gpu_id}] Done: ${model_path}" } +export -f process_single_model + # ============================================================================== -# Stage 2: Renaming +# Helper Function: Subgraph List Generation # ============================================================================== +function generate_subgraph_list() { + local target_dir="$1" + local sample_list="$2" + echo ">>> Generating subgraph list for ${target_dir}..." 
+ find ${target_dir} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${target_dir} \ + | tee $sample_list > /dev/null +} -function rename_subgraphs() { - echo ">>> [2] Rename subgraph samples under ${RAW_OUTPUT_DIR}." - echo ">>>" +# ============================================================================== +# Main Pipeline Dispatcher +# ============================================================================== +function main() { + echo ">>> Starting Pipeline..." + echo " Python: $PYTHON_EXEC" + echo " Root: $GRAPH_NET_ROOT" + echo " Logs: $LOG_DIR" + + # 1. Prepare Data + if [ ! -f "$MODEL_LIST" ]; then + echo "Error: Model list not found at $MODEL_LIST" + exit 1 + fi + grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" + total_lines=$(wc -l < "${WORKSPACE}/clean_list.txt") - # First, generate the list - generate_subgraph_list ${RAW_OUTPUT_DIR} ${RAW_SAMPLE_LIST} + echo ">>> Total Models: $total_lines | GPUS: $NUM_GPUS" + + # 2. Sharding + lines_per_gpu=$(( (total_lines + NUM_GPUS - 1) / NUM_GPUS )) + split -l ${lines_per_gpu} -d "${WORKSPACE}/clean_list.txt" "${WORKSPACE}/gpu_chunk_" + + # 3. Parallel Execution + for (( i=0; i>> Launching Worker for GPU $i..." + ( + while read -r model_path; do + process_single_model "$model_path" "$i" || true + done < "$chunk_file" + ) & + done + + # 4. Wait + echo ">>> Waiting for workers..." + wait + echo ">>> Generation Phase Complete." + + # ========================================================================== + # Post-processing + # ========================================================================== + + echo ">>> Starting Renaming Phase..." + generate_subgraph_list ${RAW_SUBGRAPH_DIR} "${WORKSPACE}/raw_list.txt" + # We redirect output to a main log file here because it's a single process $PYTHON_EXEC -m graph_net.model_path_handler \ - --model-path-list ${RAW_SAMPLE_LIST} \ + --model-path-list "${WORKSPACE}/raw_list.txt" \ --handler-config=$(base64 -w 0 <> "${LOG_DIR}/renaming.log" 2>&1 -function deduplicate_subgraphs() { - echo ">>> [3] Remove duplicated subgraph samples under ${RENAMED_OUTPUT_DIR}." - echo ">>>" + echo ">>> Starting Deduplication Phase..." + if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi - - if [ -d "${DEDUPLICATED_OUTPUT_DIR}" ]; then - echo ">>> Target directory exists. Cleaning up..." - rm -rf "${DEDUPLICATED_OUTPUT_DIR}" - fi - $PYTHON_EXEC -m graph_net.tools.deduplicated \ - --samples-dir ${RENAMED_OUTPUT_DIR} \ - --target-dir ${DEDUPLICATED_OUTPUT_DIR} -} - -# ============================================================================== -# Main Workflow -# ============================================================================== - -main() { - echo "==========================================================" - echo "START: Single Operator Dataset Generation Pipeline" - echo "Workspace: $WORKSPACE" - echo "==========================================================" - - # 1. Generate raw data - generate_raw_data - - # 2. Rename variables (Standardization) - rename_subgraphs - - # 3. Deduplicate - deduplicate_subgraphs + --samples-dir ${RENAMED_DIR} \ + --target-dir ${DEDUPLICATED_DIR} >> "${LOG_DIR}/deduplication.log" 2>&1 - echo "==========================================================" - echo "FINISH: Dataset generated at ${DEDUPLICATED_OUTPUT_DIR}" - echo "==========================================================" + echo ">>> ALL DONE. 
Final dataset located at: ${DEDUPLICATED_DIR}"
+    echo ">>> Check ${LOG_DIR} for error logs if any failures occurred."
+}

main

From 3e250b00f5e7dc1e7dc745aaedb1bf1876403d65 Mon Sep 17 00:00:00 2001
From: ywh555hhh <1916647616@qq.com>
Date: Mon, 9 Feb 2026 16:42:24 +0800
Subject: [PATCH 3/6] Refactor dataset generation script to strict serial
 execution mode

---
 graph_net/test/generate_single_op_dataset.sh | 241 ++++++-------------
 1 file changed, 68 insertions(+), 173 deletions(-)

diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh
index f3c74635b..804feff7e 100644
--- a/graph_net/test/generate_single_op_dataset.sh
+++ b/graph_net/test/generate_single_op_dataset.sh
@@ -5,207 +5,105 @@
 # Configuration Area
 # ==============================================================================
 
-# [CRITICAL NOTICE]
-# This script now uses dynamic path detection.
-# Ensure you are running inside the correct Virtual Environment.
-
-# 1. Dynamic Path Retrieval (Fixing Hardcoded Paths)
-# Detect python executable from current PATH
+# Dynamic Path Retrieval
 PYTHON_EXEC=$(which python3)
 if [ -z "$PYTHON_EXEC" ]; then
     echo "Error: 'python3' not found in PATH. Please activate your virtualenv."
     exit 1
 fi
 
-# Detect Project Root dynamically by importing the module
 GRAPH_NET_ROOT=$($PYTHON_EXEC -c "import graph_net; import os; print(os.path.dirname(os.path.dirname(graph_net.__file__)))")
 if [ -z "$GRAPH_NET_ROOT" ]; then
     echo "Error: Could not determine GRAPH_NET_ROOT. Ensure 'graph_net' is installed or in PYTHONPATH."
     exit 1
 fi
 
-# 2. Parallel Processing Config
-AUTO_GPUS=$(nvidia-smi -L 2>/dev/null | wc -l)
-if [ "$AUTO_GPUS" -eq 0 ]; then AUTO_GPUS=1; fi
-
-# Logic:
-# 1. If the script is invoked with an argument (e.g., ./script.sh 8), use that value.
-# 2. Otherwise, use the auto-detected GPU count.
-NUM_GPUS=${1:-$AUTO_GPUS}
-
-echo ">>> Detected/Set NUM_GPUS: ${NUM_GPUS}"
-
 RESUME="false"
 
-# 3. Workspace Setup
+# Workspace Setup
 TIMESTAMP=$(date +%Y%m%d_%H%M)
 WORKSPACE="/tmp/single_op_workspace_${TIMESTAMP}"
-# Override via the MODEL_LIST environment variable if needed
 MODEL_LIST="${MODEL_LIST:-${GRAPH_NET_ROOT}/graph_net/config/small100_torch_samples_list.txt}"
 
-# 4. Output Directories
+# Output Directories
 OP_NAMES_DIR="${WORKSPACE}/01_op_names"
 RANGES_DIR="${WORKSPACE}/02_ranges"
 RAW_SUBGRAPH_DIR="${WORKSPACE}/03_raw_subgraphs"
 RENAMED_DIR="${WORKSPACE}/04_renamed"
 DEDUPLICATED_DIR="${WORKSPACE}/05_deduplicated"
-LOG_DIR="${WORKSPACE}/logs" # New: Dedicated log directory
 
-export PYTHONPATH="${GRAPH_NET_ROOT}:${PYTHONPATH}"
-export GRAPH_NET_ROOT PYTHON_EXEC WORKSPACE OP_NAMES_DIR RANGES_DIR RAW_SUBGRAPH_DIR RESUME LOG_DIR
-
-mkdir -p "$WORKSPACE" "$LOG_DIR"
+mkdir -p "$WORKSPACE"
 
 # ==============================================================================
-# Core Logic: Single Model Processing (V3: Strict Error Checking)
+# Main Pipeline
 # ==============================================================================
 
-process_single_model() {
-    local model_path=$1
-    local gpu_id=$2
-
-    export CUDA_VISIBLE_DEVICES="${gpu_id}"
-
-    local safe_name=$(basename "$model_path")
-    local tmp_list="${WORKSPACE}/tmp_list_${BASHPID}.txt"
-    local log_file="${LOG_DIR}/${safe_name}_${BASHPID}.log"
-
-    echo "${model_path}" > "${tmp_list}"
-    echo "=== Processing ${model_path} ===" > "$log_file"
-
-    run_step() {
-        local step_name=$1
-        local cmd_str=$2
-
-        echo "---------------------------------------------------" >> "$log_file"
-        echo ">>> Running Stage: ${step_name}" >> "$log_file"
-
-
-        if ! 
eval "$cmd_str" >> "$log_file" 2>&1; then - echo "[GPU ${gpu_id}] System Failed at ${step_name}: ${model_path}" - return 1 - fi - - if grep -q -E "Traceback \(most recent call last\)|Error:|Exception:" "$log_file"; then - echo "[GPU ${gpu_id}] Logic Failed at ${step_name} (Found Traceback): ${model_path}" - echo " -> Log saved at: ${log_file}" - tail -n 5 "$log_file" | sed "s/^/[GPU ${gpu_id}] /" - return 1 - fi - - return 0 - } - # --- Stage 1: Op Names --- - cmd_s1="$PYTHON_EXEC -m graph_net.model_path_handler --model-path-list ${tmp_list} --handler-config=\$(base64 -w 0 <>> Starting Pipeline..." +echo " Python: $PYTHON_EXEC" +echo " Root: $GRAPH_NET_ROOT" + +# 1. Prepare Data +if [ ! -f "$MODEL_LIST" ]; then + echo "Error: Model list not found at $MODEL_LIST" + exit 1 +fi + +grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" + +# 2. Stage 1: Op Names +echo ">>> Running Stage 1: Op Names..." +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --handler-config=$(base64 -w 0 <>> Running Stage 2: Ranges..." +python3 -m graph_net.apply_sample_pass \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/sample_pass/op_extract_points_generator.py" \ + --sample-pass-class-name "OpExtractPointsGenerator" \ + --sample-pass-config=$(base64 -w 0 <>> Running Stage 3: Decompose..." +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/clean_list.txt" \ + --handler-config=$(base64 -w 0 <>> Generating subgraph list for ${target_dir}..." - find ${target_dir} -name "model.py" \ - | xargs dirname \ - | xargs realpath --relative-to=${target_dir} \ - | tee $sample_list > /dev/null -} - -# ============================================================================== -# Main Pipeline Dispatcher -# ============================================================================== -function main() { - echo ">>> Starting Pipeline..." - echo " Python: $PYTHON_EXEC" - echo " Root: $GRAPH_NET_ROOT" - echo " Logs: $LOG_DIR" - - # 1. Prepare Data - if [ ! -f "$MODEL_LIST" ]; then - echo "Error: Model list not found at $MODEL_LIST" - exit 1 - fi - grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" - total_lines=$(wc -l < "${WORKSPACE}/clean_list.txt") - - echo ">>> Total Models: $total_lines | GPUS: $NUM_GPUS" - - # 2. Sharding - lines_per_gpu=$(( (total_lines + NUM_GPUS - 1) / NUM_GPUS )) - split -l ${lines_per_gpu} -d "${WORKSPACE}/clean_list.txt" "${WORKSPACE}/gpu_chunk_" - - # 3. Parallel Execution - for (( i=0; i>> Launching Worker for GPU $i..." - ( - while read -r model_path; do - process_single_model "$model_path" "$i" || true - done < "$chunk_file" - ) & - done - - # 4. Wait - echo ">>> Waiting for workers..." - wait - echo ">>> Generation Phase Complete." - - # ========================================================================== - # Post-processing - # ========================================================================== - - echo ">>> Starting Renaming Phase..." - generate_subgraph_list ${RAW_SUBGRAPH_DIR} "${WORKSPACE}/raw_list.txt" - - # We redirect output to a main log file here because it's a single process - $PYTHON_EXEC -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/raw_list.txt" \ - --handler-config=$(base64 -w 0 <>> Running Post-processing: Rename..." 
+find ${RAW_SUBGRAPH_DIR} -name "model.py" \ + | xargs dirname \ + | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ + > "${WORKSPACE}/raw_list.txt" + +python3 -m graph_net.model_path_handler \ + --model-path-list "${WORKSPACE}/raw_list.txt" \ + --handler-config=$(base64 -w 0 <> "${LOG_DIR}/renaming.log" 2>&1 - - echo ">>> Starting Deduplication Phase..." - if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi +) - $PYTHON_EXEC -m graph_net.tools.deduplicated \ - --samples-dir ${RENAMED_DIR} \ - --target-dir ${DEDUPLICATED_DIR} >> "${LOG_DIR}/deduplication.log" 2>&1 +# 6. Post-processing: Deduplicate +echo ">>> Running Post-processing: Deduplicate..." +if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi - echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" - echo ">>> Check ${LOG_DIR} for error logs if any failures occurred." -} +python3 -m graph_net.tools.deduplicated \ + --samples-dir ${RENAMED_DIR} \ + --target-dir ${DEDUPLICATED_DIR} -main +echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" From f4f8c952f2e1175c7183b939b8d4a5bb3135bbdb Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 16:50:09 +0800 Subject: [PATCH 4/6] extract raw_list.txt generation as separate step and copy to final output --- graph_net/test/generate_single_op_dataset.sh | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 804feff7e..2fbcb3231 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -94,13 +94,15 @@ python3 -m graph_net.model_path_handler \ EOF ) -# 5. Post-processing: Rename -echo ">>> Running Post-processing: Rename..." +# 5. Generate raw_list.txt +echo ">>> Generating raw_list.txt..." find ${RAW_SUBGRAPH_DIR} -name "model.py" \ | xargs dirname \ | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ > "${WORKSPACE}/raw_list.txt" +# 6. Post-processing: Rename +echo ">>> Running Post-processing: Rename..." python3 -m graph_net.model_path_handler \ --model-path-list "${WORKSPACE}/raw_list.txt" \ --handler-config=$(base64 -w 0 <>> Running Post-processing: Deduplicate..." if [ -d "${DEDUPLICATED_DIR}" ]; then rm -rf "${DEDUPLICATED_DIR}"; fi @@ -128,4 +130,8 @@ python3 -m graph_net.tools.deduplicated \ --samples-dir ${RENAMED_DIR} \ --target-dir ${DEDUPLICATED_DIR} +# Copy raw_list.txt to final output +cp "${WORKSPACE}/raw_list.txt" "${DEDUPLICATED_DIR}/" + echo ">>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" +echo ">>> raw_list.txt also saved to: ${DEDUPLICATED_DIR}/raw_list.txt" From 570d7d128f93d8372d5ec7a079fafc572028c8ff Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 18:37:37 +0800 Subject: [PATCH 5/6] remove redundant clean_list.txt and rename raw_list.txt to generated_subgraphs_list.txt --- graph_net/test/generate_single_op_dataset.sh | 22 +++++++++----------- 1 file changed, 10 insertions(+), 12 deletions(-) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/test/generate_single_op_dataset.sh index 2fbcb3231..8781913bf 100644 --- a/graph_net/test/generate_single_op_dataset.sh +++ b/graph_net/test/generate_single_op_dataset.sh @@ -48,12 +48,10 @@ if [ ! -f "$MODEL_LIST" ]; then exit 1 fi -grep -v "^#" "${MODEL_LIST}" | grep -v "^$" > "${WORKSPACE}/clean_list.txt" - # 2. Stage 1: Op Names echo ">>> Running Stage 1: Op Names..." 
python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --handler-config=$(base64 -w 0 <>> Running Stage 2: Ranges..." python3 -m graph_net.apply_sample_pass \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --sample-pass-file-path "$GRAPH_NET_ROOT/graph_net/sample_pass/op_extract_points_generator.py" \ --sample-pass-class-name "OpExtractPointsGenerator" \ --sample-pass-config=$(base64 -w 0 <>> Running Stage 3: Decompose..." python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/clean_list.txt" \ + --model-path-list "${MODEL_LIST}" \ --handler-config=$(base64 -w 0 <>> Generating raw_list.txt..." +# 5. Generate generated_subgraphs_list.txt +echo ">>> Generating generated_subgraphs_list.txt..." find ${RAW_SUBGRAPH_DIR} -name "model.py" \ | xargs dirname \ | xargs realpath --relative-to=${RAW_SUBGRAPH_DIR} \ - > "${WORKSPACE}/raw_list.txt" + > "${WORKSPACE}/generated_subgraphs_list.txt" # 6. Post-processing: Rename echo ">>> Running Post-processing: Rename..." python3 -m graph_net.model_path_handler \ - --model-path-list "${WORKSPACE}/raw_list.txt" \ + --model-path-list "${WORKSPACE}/generated_subgraphs_list.txt" \ --handler-config=$(base64 -w 0 <>> ALL DONE. Final dataset located at: ${DEDUPLICATED_DIR}" -echo ">>> raw_list.txt also saved to: ${DEDUPLICATED_DIR}/raw_list.txt" +echo ">>> generated_subgraphs_list.txt also saved to: ${DEDUPLICATED_DIR}/generated_subgraphs_list.txt" From 304dfe6c7573e30d1f30f23bb93bd7cb78f73281 Mon Sep 17 00:00:00 2001 From: ywh555hhh <1916647616@qq.com> Date: Mon, 9 Feb 2026 18:39:58 +0800 Subject: [PATCH 6/6] move generate_single_op_dataset.sh to tools directory --- graph_net/{test => tools}/generate_single_op_dataset.sh | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename graph_net/{test => tools}/generate_single_op_dataset.sh (100%) diff --git a/graph_net/test/generate_single_op_dataset.sh b/graph_net/tools/generate_single_op_dataset.sh similarity index 100% rename from graph_net/test/generate_single_op_dataset.sh rename to graph_net/tools/generate_single_op_dataset.sh
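
Note on the config-passing convention used throughout this series: every
stage hands its options to graph_net as a base64-encoded JSON blob. A minimal
sketch of the round trip on the sending side (the JSON keys here are
illustrative placeholders, not the actual GraphNet handler schema; the
receiving pass presumably decodes with the standard base64/json libraries):

    CFG=$(printf '%s' '{"resume": false, "output_dir": "/tmp/out"}' | base64 -w 0)
    echo "$CFG" | base64 -d    # round-trips back to the original JSON

Typical invocation of the final script, assuming 'graph_net' is importable
from the active virtualenv (MODEL_LIST falls back to the bundled small100
list when unset):

    MODEL_LIST=/path/to/your_full_samples_list.txt \
        bash graph_net/tools/generate_single_op_dataset.sh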