From 044104da2ba6dc00543a4152eda97d7960ff0f88 Mon Sep 17 00:00:00 2001
From: "xiongshaopan.xsp"
Date: Tue, 3 Feb 2026 21:08:23 +0800
Subject: [PATCH] (feat): publish roll v0.2.0.

Co-Authored-By: chengengru.cgr
Co-Authored-By: fengjingxuan.fjx
Co-Authored-By: ft498870
Co-Authored-By: heyancheng.hyc
Co-Authored-By: hongzhen.yj
Co-Authored-By: huangju.hj
Co-Authored-By: jiamang.wang
Co-Authored-By: scott.lxy
Co-Authored-By: shenjingyu.sjy
Co-Authored-By: shenliao.sla
Co-Authored-By: tianhe.lzd
Co-Authored-By: weixun.wwx
Co-Authored-By: wzy496492
Co-Authored-By: xiongshaopan.xsp
Co-Authored-By: xuehuanran.xhr
Co-Authored-By: zhaohaizhou.zhz
Co-Authored-By: bzd02333762
Co-authored-by: beiyue.lj
---
 .gitignore | 4 - README.md | 19 +- data/deepeyes_mini_10.parquet | Bin 0 -> 405793 bytes docs/qa.md | 93 - .../README_code_sandbox_reward_worker.md | 216 -- .../Developer Guide/custom_loss_func.md | 360 +++ .../llm_as_judge_optimization.md | 262 ++ .../Developer Guide/rollout_mock_usage.md | 289 +++ .../Advanced Features/dynamic_batching.md | 214 ++ .../Advanced Features/sequence_packing.md | 319 +++ .../docs/User Guides/Configuration/fsdp2.md | 246 ++ .../docs/User Guides/Configuration/vllm.md | 14 - .../Pipeline/agent_pipeline_start.md | 3 + .../Pipeline/agentic_pipeline_start.md | 4 +- .../Pipeline/distill_pipeline_start.md | 1 + .../Pipeline/rlvr_pipeline_start.md | 3 + .../Pipeline/sft_pipeline_start.md | 272 ++ .../Developer Guide/custom_loss_func_cn.md | 362 +++ .../llm_as_judge_optimization.md | 262 ++ .../Developer Guide/rollout_mock_usage.md | 288 +++ .../Advanced Features/dynamic_batching.md | 214 ++ .../Advanced Features/sequence_packing.md | 321 +++ .../User Guides/Configuration/fsdp2.md | 242 ++ .../current/User Guides/Configuration/vllm.md | 15 - .../Pipeline/agentic_pipeline_start.md | 2 + .../Pipeline/distill_pipeline_start.md | 1 + .../Pipeline/rlvr_pipeline_start.md | 2 + .../Pipeline/sft_pipeline_start.md | 275 +- examples/agentic_deepeyes/deepeyes.yaml | 218 ++ .../agentic_deepeyes/run_agentic_pipeline.sh | 6 + .../agent_val_frozen_lake-pg_var.yaml | 175 ++ ...ent_val_frozen_lake-pg_var_is_correct.yaml | 192 ++ .../agent_val_frozen_lake.yaml | 19 +- .../agentic_sokoban_rollout_mock_dump.yaml | 153 ++ .../agentic_val_sokoban_dynamic_batching.yaml | 175 ++ .../agentic_val_sokoban_lora.yaml | 169 ++ .../agentic_val_sokoban_native.yaml | 175 ++ .../agentic_val_sokoban_sandbox.yaml | 176 ++ .../submit_pipeline_amd.sh | 49 - .../submit_pipeline_amd_async.sh | 49 - .../run_agentic_pipeline_webshop.sh | 0 .../rlvr_config.yaml | 4 +- .../run_rlvr_pipeline.sh | 0 .../rlvr_config_amd_async.yaml | 2 - .../rlvr_config_lora.yaml | 265 ++ .../rlvr_config_sequence_packing.yaml | 273 ++ .../rlvr_rollout_mock_dump.yaml | 166 ++ .../submit_pipeline_amd.sh | 49 - .../submit_pipeline_amd_async.sh | 49 - .../submit_pipeline_amd_zero3_lora.sh | 46 - .../distill_vl_zero3.yaml | 1 + .../qwen2.5-vl-7B-math/rlvr_math_lora.yaml | 2 + .../rlvr_math_megatron.yaml | 2 + .../qwen2.5-vl-7B-math/rlvr_math_zero3.yaml | 2 + .../qwen2.5-vl-7B-math/submit_pipeline_amd.sh | 49 - examples/qwen2.5-vl-7B-rlvr/rlvr_async.yaml | 151 ++ .../qwen2.5-vl-7B-rlvr/rlvr_megatron.yaml | 2 + .../submit_pipeline_amd.sh | 49 - .../agentic_val_sokoban_30a3.yaml | 184 ++ .../rlvr_config_amd.yaml | 6 +- .../rlvr_config_lora.yaml | 271 ++ .../rlvr_config_sglang.yaml | 2 +- .../submit_pipeline_amd.sh | 49 - .../submit_pipeline_amd.sh | 49 - .../rlvr_megatron.yaml | 48 +- examples/qwen3-omni/run_rlvr_pipeline.sh | 5 + 
.../rlvr_megatron_80GB.yaml | 172 ++ .../run_rlvr_pipeline.sh | 8 + .../run_rlvr_pipeline.sh | 0 .../rlvr_megatron_80G.yaml | 163 ++ .../run_rlvr_pipeline.sh | 5 +- examples/start_agentic_pipeline.py | 13 +- .../reward_fl_config.yaml | 3 + mcore_adapter/requirements.txt | 2 +- mcore_adapter/src/mcore_adapter/__init__.py | 2 +- .../src/mcore_adapter/adapters/__init__.py | 6 +- .../src/mcore_adapter/adapters/lora_layer.py | 28 +- .../src/mcore_adapter/checkpointing.py | 8 - mcore_adapter/src/mcore_adapter/constants.py | 2 + mcore_adapter/src/mcore_adapter/initialize.py | 3 +- .../src/mcore_adapter/models/__init__.py | 2 + .../models/converter/convert_utils.py | 109 +- .../models/converter/dist_converter.py | 120 +- .../models/converter/model_converter.py | 107 +- .../models/converter/post_converter.py | 278 ++- .../models/converter/template.py | 97 +- .../models/deepseek_v3/__init__.py | 16 +- .../mcore_adapter/models/glm4_moe/__init__.py | 97 +- .../src/mcore_adapter/models/model_config.py | 63 +- .../src/mcore_adapter/models/model_factory.py | 88 +- .../src/mcore_adapter/models/model_utils.py | 53 +- .../models/qwen2_5_vl/modeling_qwen2_5_vl.py | 18 +- .../models/qwen2_vl/modeling_qwen2_vl.py | 14 +- .../mcore_adapter/models/qwen3/__init__.py | 12 +- .../models/qwen3_moe/__init__.py | 7 +- .../models/qwen3_next/__init__.py | 42 +- .../models/qwen3_next/modeling_qwen3_next.py | 7 +- .../models/qwen3_omni/__init__.py | 142 ++ .../models/qwen3_omni/config_qwen3_omni.py | 69 + .../models/qwen3_omni/modeling_qwen3_omni.py | 305 +++ .../mcore_adapter/models/qwen3_vl/__init__.py | 28 + .../models/qwen3_vl/modeling_qwen3_vl.py | 4 + .../models/qwen3_vl/rope_utils.py | 1 - .../models/qwen3_vl/transformer_block.py | 2 +- .../models/qwen3_vl_moe/__init__.py | 190 ++ .../parallel_functions/vocab_parallel.py | 6 +- mcore_adapter/src/mcore_adapter/patcher.py | 160 ++ .../src/mcore_adapter/platforms/__init__.py | 7 +- .../src/mcore_adapter/platforms/cpu.py | 7 +- .../src/mcore_adapter/platforms/cuda.py | 20 +- .../src/mcore_adapter/platforms/npu.py | 10 +- .../src/mcore_adapter/platforms/platform.py | 24 +- .../src/mcore_adapter/platforms/rocm.py | 20 +- .../src/mcore_adapter/platforms/unknown.py | 23 +- .../src/mcore_adapter/trainer/trainer.py | 74 +- .../src/mcore_adapter/trainer/utils.py | 20 +- .../src/mcore_adapter/training_args.py | 29 + mcore_adapter/src/mcore_adapter/utils.py | 18 +- mcore_adapter/tools/convert.py | 17 +- requirements_common.txt | 2 +- roll/__init__.py | 11 + roll/configs/base_config.py | 185 +- roll/configs/data_args.py | 1 - roll/configs/generating_args.py | 4 + roll/configs/model_args.py | 29 +- roll/configs/training_args.py | 4 + roll/configs/worker_config.py | 225 +- roll/datasets/collator.py | 56 +- roll/datasets/dataset.py | 73 +- roll/datasets/global_dataset.py | 6 +- roll/distributed/executor/cluster.py | 35 +- .../executor/model_update_group.py | 173 +- roll/distributed/executor/worker.py | 51 +- roll/distributed/scheduler/driver_utils.py | 18 +- .../scheduler/generate_scheduler.py | 2209 ++++++++++++----- .../distributed/scheduler/reward_scheduler.py | 6 +- .../scheduler/rollout_mock_mixin.py | 200 ++ .../scheduler/rollout_scheduler.py | 377 ++- .../scheduler/user_defined_rollout_loop.py | 271 ++ .../strategy/deepspeed_strategy.py | 171 +- roll/distributed/strategy/factory.py | 17 +- roll/distributed/strategy/fsdp2_strategy.py | 1268 ++++++++++ roll/distributed/strategy/fsdp_strategy.py | 0 roll/distributed/strategy/hf_strategy.py | 80 +- 
.../distributed/strategy/megatron_strategy.py | 471 ++-- roll/distributed/strategy/mock_strategy.py | 3 - roll/distributed/strategy/sglang_strategy.py | 387 +-- roll/distributed/strategy/strategy.py | 184 +- roll/distributed/strategy/vllm_strategy.py | 460 ++-- roll/models/model_providers.py | 466 +++- .../agentic/agentic_actor_pg_worker.py | 588 +++++ roll/pipeline/agentic/agentic_actor_worker.py | 119 +- roll/pipeline/agentic/agentic_config.py | 112 +- roll/pipeline/agentic/agentic_pipeline.py | 541 +++- .../agentic/agentic_rollout_pipeline.py | 9 +- roll/pipeline/agentic/env/__init__.py | 4 +- .../pipeline/agentic/env/deepeyes/__init__.py | 7 + roll/pipeline/agentic/env/deepeyes/env.py | 451 ++++ roll/pipeline/agentic/env/deepeyes/utils.py | 370 +++ roll/pipeline/agentic/env/sokoban/env.py | 2 +- .../agentic/env/sokoban/native_env.py | 284 +++ .../env_manager/agent_native_env_manager.py | 521 ++++ .../env_manager/step_concat_env_manager.py | 3 +- .../agentic/env_manager/step_env_manager.py | 8 +- .../agentic/env_manager/token_mask_utils.py | 97 +- .../agentic/env_manager/traj_env_manager.py | 32 +- .../env_manager/vl_traj_env_manager.py | 337 ++- roll/pipeline/agentic/llm_proxy/__init__.py | 4 +- .../agentic/llm_proxy/openai_proxy.py | 6 +- .../agentic/llm_proxy/policy_proxy.py | 1 - .../pipeline/agentic/llm_proxy/proxy_utils.py | 158 ++ roll/pipeline/agentic/utils.py | 155 +- roll/pipeline/base_pipeline.py | 58 +- roll/pipeline/base_worker.py | 483 ++-- .../diffusion/reward_fl/reward_fl_pipeline.py | 3 +- roll/pipeline/distill/distill_config.py | 2 - roll/pipeline/distill/distill_pipeline.py | 16 +- roll/pipeline/distill/distill_vlm_pipeline.py | 3 +- roll/pipeline/distill/distill_worker.py | 109 +- roll/pipeline/dpo/dpo_config.py | 6 +- roll/pipeline/dpo/dpo_pipeline.py | 5 +- roll/pipeline/rlvr/actor_pg_worker.py | 262 +- roll/pipeline/rlvr/actor_worker.py | 173 +- roll/pipeline/rlvr/rewards/__init__.py | 3 +- .../crossthinkqa_rule_reward_worker.py | 3 - .../rlvr/rewards/detection_reward_worker.py | 20 +- .../rewards/general_val_rule_reward_worker.py | 3 - .../rlvr/rewards/ifeval_rule_reward_worker.py | 3 - .../rlvr/rewards/llm_judge_reward_worker.py | 9 +- .../rlvr/rewards/math_rule_reward_worker.py | 46 +- ...ultiple_choice_boxed_rule_reward_worker.py | 3 - roll/pipeline/rlvr/rlvr_config.py | 40 +- roll/pipeline/rlvr/rlvr_math_vlm_pipeline.py | 9 +- roll/pipeline/rlvr/rlvr_pipeline.py | 225 +- roll/pipeline/rlvr/rlvr_rollout_pipeline.py | 13 +- roll/pipeline/rlvr/rlvr_vlm_pipeline.py | 206 +- roll/pipeline/rlvr/utils.py | 8 +- roll/pipeline/sft/sft_config.py | 2 - roll/pipeline/sft/sft_pipeline.py | 60 +- roll/pipeline/sft/sft_worker.py | 20 +- roll/platforms/cuda.py | 7 +- roll/platforms/npu.py | 6 +- roll/platforms/rocm.py | 15 +- roll/platforms/unknown.py | 4 +- roll/third_party/deepspeed/model_update.py | 205 ++ .../{vllm/vllm_0_10_0 => fsdp2}/__init__.py | 0 roll/third_party/fsdp2/model_update.py | 323 +++ roll/third_party/fsdp2/qwen3_moe_patch.py | 36 + roll/third_party/fsdp2/tiled_mlp.py | 239 ++ roll/third_party/megatron/model_update.py | 483 ++++ roll/third_party/megatron/tensor_parallel.py | 1 - roll/third_party/sglang/__init__.py | 3 + roll/third_party/sglang/async_engine.py | 205 -- roll/third_party/sglang/fp8.py | 304 +++ roll/third_party/sglang/io_struct.py | 62 - .../sglang/v0410post2_patch/__init__.py | 1 - .../sglang/v0410post2_patch/engine.py | 125 +- .../sglang/v0410post2_patch/io_struct.py | 62 - .../sglang/v0410post2_patch/model_runner.py | 195 -- 
.../sglang/v0410post2_patch/scheduler.py | 96 - .../v0410post2_patch/tokenizer_manager.py | 126 - .../sglang/v0410post2_patch/tp_worker.py | 86 - .../sglang/v046post4_patch/__init__.py | 1 - .../sglang/v046post4_patch/async_engine.py | 169 -- .../sglang/v046post4_patch/engine.py | 143 +- .../sglang/v046post4_patch/io_struct.py | 62 - .../sglang/v046post4_patch/model_runner.py | 190 -- .../sglang/v046post4_patch/scheduler.py | 98 - .../v046post4_patch/tokenizer_manager.py | 126 - .../sglang/v046post4_patch/tp_worker.py | 86 - .../third_party/sglang/v052_patch/__init__.py | 1 - roll/third_party/sglang/v052_patch/engine.py | 149 +- .../sglang/v052_patch/io_struct.py | 62 - .../sglang/v052_patch/model_runner.py | 200 -- .../sglang/v052_patch/scheduler.py | 108 - .../sglang/v052_patch/tokenizer_manager.py | 112 - .../sglang/v052_patch/tp_worker.py | 85 - .../third_party/sglang/v054_patch/__init__.py | 1 - roll/third_party/sglang/v054_patch/engine.py | 163 +- .../sglang/v054_patch/model_runner.py | 246 -- .../sglang/v054_patch/scheduler.py | 105 - .../sglang/v054_patch/tokenizer_manager.py | 112 - .../sglang/v054_patch/tp_worker.py | 58 - roll/third_party/vllm/__init__.py | 141 +- roll/third_party/vllm/async_llm.py | 28 + roll/third_party/vllm/async_llm_engine.py | 27 + roll/third_party/vllm/fp8.py | 83 +- roll/third_party/vllm/vllm_0_10_0/llm.py | 233 -- .../vllm/vllm_0_10_0/llm_engine.py | 89 - .../vllm_0_10_0/ray_distributed_executor.py | 265 -- .../vllm/vllm_0_10_0/v1/async_llm.py | 98 - .../vllm/vllm_0_10_0/v1/llm_engine.py | 241 -- .../v1/ray_distributed_executor.py | 9 - .../third_party/vllm/vllm_0_10_0/v1/worker.py | 51 - roll/third_party/vllm/vllm_0_10_0/worker.py | 15 - roll/third_party/vllm/vllm_0_10_2/llm.py | 285 --- .../vllm/vllm_0_10_2/llm_engine.py | 87 - .../vllm_0_10_2/ray_distributed_executor.py | 1 + .../vllm/vllm_0_10_2/v1/llm_engine.py | 235 -- .../v1/ray_distributed_executor.py | 2 +- .../third_party/vllm/vllm_0_10_2/v1/worker.py | 52 - roll/third_party/vllm/vllm_0_10_2/worker.py | 15 - roll/third_party/vllm/vllm_0_11_0/llm.py | 307 --- .../vllm/vllm_0_11_0/llm_engine.py | 87 - .../vllm_0_11_0/ray_distributed_executor.py | 29 +- .../vllm/vllm_0_11_0/v1/llm_engine.py | 233 -- .../third_party/vllm/vllm_0_11_0/v1/worker.py | 51 - roll/third_party/vllm/vllm_0_11_0/worker.py | 15 - .../v1 => vllm_0_12_0}/__init__.py | 0 .../vllm_0_12_0/ray_distributed_executor.py | 190 ++ roll/third_party/vllm/vllm_0_8_4/__init__.py | 81 +- roll/third_party/vllm/vllm_0_8_4/llm.py | 230 -- .../third_party/vllm/vllm_0_8_4/llm_engine.py | 89 - .../vllm_0_8_4/ray_distributed_executor.py | 1 + .../vllm/vllm_0_8_4/v1/async_llm.py | 98 - .../vllm/vllm_0_8_4/v1/llm_engine.py | 244 -- roll/third_party/vllm/vllm_0_8_4/v1/worker.py | 51 - roll/third_party/vllm/vllm_0_8_4/worker.py | 16 - roll/third_party/vllm/vllm_utils.py | 61 +- roll/third_party/vllm/worker.py | 169 ++ roll/third_party/vllm/worker_helper.py | 120 - roll/utils/asyncio_decorator.py | 41 + roll/utils/collective/collective.py | 23 +- roll/utils/collective/pg_utils.py | 8 +- roll/utils/constants.py | 4 +- roll/utils/context_parallel/__init__.py | 1 - roll/utils/context_parallel/all_to_all.py | 26 +- .../utils/context_parallel/autograd_gather.py | 98 + .../hf_flash_attention_patch.py | 404 +++ roll/utils/context_parallel/monkey_patch.py | 23 +- roll/utils/context_parallel/rmpad_ulysses.py | 136 + .../context_parallel/ulysses_attention.py | 14 +- roll/utils/context_parallel/vlm_cp_patch.py | 147 ++ roll/utils/dynamic_batching.py | 72 +- 
roll/utils/fp8.py | 61 + roll/utils/fsdp_utils.py | 290 +++ roll/utils/functionals.py | 461 +++- roll/utils/logging.py | 7 + roll/utils/metrics/metrics_manager.py | 38 +- roll/utils/send_recv_utils.py | 202 +- roll/utils/sequence_packing.py | 623 +++-- roll/utils/str_utils.py | 67 +- roll/utils/taskgroups.py | 298 +++ roll/utils/tracking.py | 47 + roll/utils/train_infer_corrections.py | 255 ++ tests/agentic/env/test_mcp_client.py | 94 + tests/agentic/env/test_sokoban_mcp.py | 159 ++ tests/agentic/env/test_sokoban_sandbox.py | 88 + .../test_traj_env_manager_debug.py | 152 ++ .../env_manager/traj_env_manager_debug.yaml | 99 + tests/agentic/test_segment_masked_mean.py | 197 ++ tests/datasets/test_collator.py | 110 +- .../executor/test_async_cluster.py | 66 + .../distributed/executor/test_ray_debugger.py | 35 + .../scheduler/test_generate_scheduler.py | 495 ++++ .../strategy/checkpoint/fsdp_config.yaml | 55 + .../strategy/checkpoint/fsdp_lora_config.yaml | 59 + .../strategy/checkpoint/megatron_config.yaml | 12 +- .../strategy/checkpoint/test_fsdp_strategy.py | 54 + .../checkpoint/test_megatron_stategy_ckpt.py | 6 +- .../test_fsdp2_cp_grad_equivalence.py | 169 ++ .../test_fsdp2_cp_qwen3_hf_equivalence.py | 163 ++ ...est_fsdp2_cp_qwen3_hf_rmpad_equivalence.py | 504 ++++ .../test_fsdp2_cp_ulysses_equivalence.py | 94 + .../test_fsdp2_cp_vlm_rmpad_equivalence.py | 484 ++++ .../grad_norm/run_fsdp2_distributed_test.sh | 38 + .../grad_norm/test_fsdp2_grad_norm.py | 291 +++ .../test_grad_accumulation_scaling.py | 324 +++ .../strategy/grad_norm/test_grad_norm_unit.py | 222 ++ .../log_probs/analyze_layer_divergence.py | 612 +++++ .../strategy/log_probs/apply_model_patch.py | 744 ++++++ .../log_probs/layer_states_capture.py | 165 ++ .../log_probs/log_probs_cmp_config.yaml | 2 +- .../log_probs/log_probs_fsdp_config.yaml | 109 + .../log_probs/log_probs_fsdp_cp_config.yaml | 110 + .../log_probs_fsdp_cp_rmpad_config.yaml | 111 + .../log_probs/log_probs_fsdp_lora_config.yaml | 113 + .../log_probs_fsdp_vlm_cp2_config.yaml | 103 + .../log_probs/log_probs_megatron_config.yaml | 2 +- .../log_probs/test_ds_hf_log_probs.py | 80 +- .../strategy/log_probs/test_fsdp_log_probs.py | 311 +++ .../log_probs/test_fsdp_vlm_layer_states.py | 308 +++ .../log_probs/test_fsdp_vlm_log_probs.py | 214 ++ .../log_probs/test_fsdp_vlm_log_probs_perf.py | 310 +++ .../log_probs/test_megatron_strategy.py | 12 +- .../model_update_baseline_config.yaml | 2 +- .../model_update/model_update_debug.py | 50 +- .../model_update/model_update_fsdp.yaml | 70 + .../model_update_multi_group_debug.py | 50 +- .../model_update/model_update_pipeline.py | 9 +- .../model_update_pipeline_multi_group.py | 6 +- .../standalone/fsdp2_standalone_strategy.py | 523 ++++ .../standalone/run_fsdp2_standalone.py | 114 + .../strategy/test_fsdp_strategy_collection.py | 918 +++++++ .../models/cuda_mem/test_mca_model_forward.py | 4 +- .../megatron/test_offload_states.py | 16 +- tests/third_party/sglang/test_abort.py | 119 + tests/third_party/sglang/test_fp8.py | 68 + tests/third_party/vllm/test_abort.py | 147 ++ tests/third_party/vllm/test_collective_rpc.py | 83 + tests/third_party/vllm/test_fp8.py | 141 +- tests/third_party/vllm/test_fp8_perf.py | 58 +- tests/third_party/vllm/test_model_update.py | 130 +- .../third_party/vllm/test_vllm_local_actor.py | 35 +- tests/third_party/vllm/test_vllm_mem_oom.py | 169 +- tests/third_party/vllm/utils.py | 86 + tests/utils/test_action_parser.py | 24 + tests/utils/test_cp_rmpad_ulysses_utils.py | 48 + 
tests/utils/test_dynamic_batching.py | 34 +- tests/utils/test_sequence_packing.py | 247 ++ tests/utils/test_taskgroups.py | 115 + 371 files changed, 34626 insertions(+), 11503 deletions(-) create mode 100644 data/deepeyes_mini_10.parquet delete mode 100644 docs/qa.md delete mode 100644 docs/reward_worker_examples/README_code_sandbox_reward_worker.md create mode 100644 docs_roll/docs/Development/Developer Guide/custom_loss_func.md create mode 100644 docs_roll/docs/Development/Developer Guide/llm_as_judge_optimization.md create mode 100644 docs_roll/docs/Development/Developer Guide/rollout_mock_usage.md create mode 100644 docs_roll/docs/User Guides/Advanced Features/dynamic_batching.md create mode 100644 docs_roll/docs/User Guides/Advanced Features/sequence_packing.md create mode 100644 docs_roll/docs/User Guides/Configuration/fsdp2.md create mode 100644 docs_roll/docs/User Guides/Pipeline/sft_pipeline_start.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/custom_loss_func_cn.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/llm_as_judge_optimization.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/rollout_mock_usage.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/dynamic_batching.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/sequence_packing.md create mode 100644 docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/fsdp2.md create mode 100644 examples/agentic_deepeyes/deepeyes.yaml create mode 100755 examples/agentic_deepeyes/run_agentic_pipeline.sh create mode 100644 examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var_is_correct.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agentic_sokoban_rollout_mock_dump.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_dynamic_batching.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_native.yaml create mode 100644 examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_sandbox.yaml delete mode 100644 examples/qwen2.5-0.5B-agentic/submit_pipeline_amd.sh delete mode 100644 examples/qwen2.5-0.5B-agentic/submit_pipeline_amd_async.sh mode change 100644 => 100755 examples/qwen2.5-7B-agentic_megatron/run_agentic_pipeline_webshop.sh mode change 100644 => 100755 examples/qwen2.5-7B-rlvr-offpolicy/run_rlvr_pipeline.sh create mode 100644 examples/qwen2.5-7B-rlvr_megatron/rlvr_config_lora.yaml create mode 100644 examples/qwen2.5-7B-rlvr_megatron/rlvr_config_sequence_packing.yaml create mode 100644 examples/qwen2.5-7B-rlvr_megatron/rlvr_rollout_mock_dump.yaml delete mode 100644 examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd.sh delete mode 100644 examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_async.sh delete mode 100644 examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_zero3_lora.sh delete mode 100644 examples/qwen2.5-vl-7B-math/submit_pipeline_amd.sh create mode 100644 examples/qwen2.5-vl-7B-rlvr/rlvr_async.yaml delete mode 100644 examples/qwen3-235BA22B-rlvr_megatron/submit_pipeline_amd.sh create mode 100644 examples/qwen3-30BA3B-agentic_fsdp2/agentic_val_sokoban_30a3.yaml create 
mode 100644 examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_lora.yaml delete mode 100644 examples/qwen3-30BA3B-rlvr_megatron/submit_pipeline_amd.sh delete mode 100644 examples/qwen3-next-80BA3B-rlvr_megatron/submit_pipeline_amd.sh rename examples/{qwen3-vl-4B-rlvr_megatron => qwen3-omni}/rlvr_megatron.yaml (82%) create mode 100755 examples/qwen3-omni/run_rlvr_pipeline.sh create mode 100644 examples/qwen3-vl-30BA3B-rlvr_megatron/rlvr_megatron_80GB.yaml create mode 100755 examples/qwen3-vl-30BA3B-rlvr_megatron/run_rlvr_pipeline.sh mode change 100644 => 100755 examples/qwen3-vl-32B-rlvr_megatron/run_rlvr_pipeline.sh create mode 100644 examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron_80G.yaml mode change 100644 => 100755 examples/qwen3-vl-4B-rlvr_megatron/run_rlvr_pipeline.sh create mode 100644 mcore_adapter/src/mcore_adapter/models/qwen3_omni/__init__.py create mode 100644 mcore_adapter/src/mcore_adapter/models/qwen3_omni/config_qwen3_omni.py create mode 100644 mcore_adapter/src/mcore_adapter/models/qwen3_omni/modeling_qwen3_omni.py create mode 100644 mcore_adapter/src/mcore_adapter/models/qwen3_vl_moe/__init__.py create mode 100644 mcore_adapter/src/mcore_adapter/patcher.py create mode 100644 roll/distributed/scheduler/rollout_mock_mixin.py create mode 100644 roll/distributed/scheduler/user_defined_rollout_loop.py create mode 100644 roll/distributed/strategy/fsdp2_strategy.py delete mode 100644 roll/distributed/strategy/fsdp_strategy.py create mode 100644 roll/pipeline/agentic/agentic_actor_pg_worker.py create mode 100644 roll/pipeline/agentic/env/deepeyes/__init__.py create mode 100644 roll/pipeline/agentic/env/deepeyes/env.py create mode 100644 roll/pipeline/agentic/env/deepeyes/utils.py create mode 100644 roll/pipeline/agentic/env/sokoban/native_env.py create mode 100644 roll/pipeline/agentic/env_manager/agent_native_env_manager.py create mode 100644 roll/pipeline/agentic/llm_proxy/proxy_utils.py create mode 100644 roll/third_party/deepspeed/model_update.py rename roll/third_party/{vllm/vllm_0_10_0 => fsdp2}/__init__.py (100%) create mode 100644 roll/third_party/fsdp2/model_update.py create mode 100644 roll/third_party/fsdp2/qwen3_moe_patch.py create mode 100644 roll/third_party/fsdp2/tiled_mlp.py create mode 100644 roll/third_party/megatron/model_update.py delete mode 100644 roll/third_party/sglang/async_engine.py create mode 100644 roll/third_party/sglang/fp8.py delete mode 100644 roll/third_party/sglang/io_struct.py delete mode 100644 roll/third_party/sglang/v0410post2_patch/io_struct.py delete mode 100644 roll/third_party/sglang/v0410post2_patch/model_runner.py delete mode 100644 roll/third_party/sglang/v0410post2_patch/scheduler.py delete mode 100644 roll/third_party/sglang/v0410post2_patch/tokenizer_manager.py delete mode 100644 roll/third_party/sglang/v0410post2_patch/tp_worker.py delete mode 100644 roll/third_party/sglang/v046post4_patch/async_engine.py delete mode 100644 roll/third_party/sglang/v046post4_patch/io_struct.py delete mode 100644 roll/third_party/sglang/v046post4_patch/model_runner.py delete mode 100644 roll/third_party/sglang/v046post4_patch/scheduler.py delete mode 100644 roll/third_party/sglang/v046post4_patch/tokenizer_manager.py delete mode 100644 roll/third_party/sglang/v046post4_patch/tp_worker.py delete mode 100644 roll/third_party/sglang/v052_patch/io_struct.py delete mode 100644 roll/third_party/sglang/v052_patch/model_runner.py delete mode 100644 roll/third_party/sglang/v052_patch/scheduler.py delete mode 100644 
roll/third_party/sglang/v052_patch/tokenizer_manager.py delete mode 100644 roll/third_party/sglang/v052_patch/tp_worker.py delete mode 100644 roll/third_party/sglang/v054_patch/model_runner.py delete mode 100644 roll/third_party/sglang/v054_patch/scheduler.py delete mode 100644 roll/third_party/sglang/v054_patch/tokenizer_manager.py delete mode 100644 roll/third_party/sglang/v054_patch/tp_worker.py create mode 100644 roll/third_party/vllm/async_llm.py create mode 100644 roll/third_party/vllm/async_llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/llm.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/ray_distributed_executor.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/v1/async_llm.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/v1/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/v1/ray_distributed_executor.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/v1/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_10_0/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_10_2/llm.py delete mode 100644 roll/third_party/vllm/vllm_0_10_2/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_10_2/v1/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_10_2/v1/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_10_2/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_11_0/llm.py delete mode 100644 roll/third_party/vllm/vllm_0_11_0/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_11_0/v1/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_11_0/v1/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_11_0/worker.py rename roll/third_party/vllm/{vllm_0_10_0/v1 => vllm_0_12_0}/__init__.py (100%) create mode 100644 roll/third_party/vllm/vllm_0_12_0/ray_distributed_executor.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/llm.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/v1/async_llm.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/v1/llm_engine.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/v1/worker.py delete mode 100644 roll/third_party/vllm/vllm_0_8_4/worker.py create mode 100644 roll/third_party/vllm/worker.py delete mode 100644 roll/third_party/vllm/worker_helper.py create mode 100644 roll/utils/asyncio_decorator.py create mode 100644 roll/utils/context_parallel/autograd_gather.py create mode 100644 roll/utils/context_parallel/hf_flash_attention_patch.py create mode 100644 roll/utils/context_parallel/rmpad_ulysses.py create mode 100644 roll/utils/context_parallel/vlm_cp_patch.py create mode 100644 roll/utils/fp8.py create mode 100644 roll/utils/fsdp_utils.py create mode 100644 roll/utils/taskgroups.py create mode 100644 roll/utils/train_infer_corrections.py create mode 100644 tests/agentic/env/test_mcp_client.py create mode 100644 tests/agentic/env/test_sokoban_mcp.py create mode 100644 tests/agentic/env/test_sokoban_sandbox.py create mode 100644 tests/agentic/env_manager/test_traj_env_manager_debug.py create mode 100644 tests/agentic/test_segment_masked_mean.py create mode 100644 tests/distributed/executor/test_async_cluster.py create mode 100644 tests/distributed/executor/test_ray_debugger.py create mode 100644 tests/distributed/scheduler/test_generate_scheduler.py create mode 100644 tests/distributed/strategy/checkpoint/fsdp_config.yaml create mode 100644 
tests/distributed/strategy/checkpoint/fsdp_lora_config.yaml create mode 100644 tests/distributed/strategy/checkpoint/test_fsdp_strategy.py create mode 100644 tests/distributed/strategy/context_parallel/test_fsdp2_cp_grad_equivalence.py create mode 100644 tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_equivalence.py create mode 100644 tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py create mode 100644 tests/distributed/strategy/context_parallel/test_fsdp2_cp_ulysses_equivalence.py create mode 100644 tests/distributed/strategy/context_parallel/test_fsdp2_cp_vlm_rmpad_equivalence.py create mode 100644 tests/distributed/strategy/grad_norm/run_fsdp2_distributed_test.sh create mode 100644 tests/distributed/strategy/grad_norm/test_fsdp2_grad_norm.py create mode 100644 tests/distributed/strategy/grad_norm/test_grad_accumulation_scaling.py create mode 100644 tests/distributed/strategy/grad_norm/test_grad_norm_unit.py create mode 100644 tests/distributed/strategy/log_probs/analyze_layer_divergence.py create mode 100644 tests/distributed/strategy/log_probs/apply_model_patch.py create mode 100644 tests/distributed/strategy/log_probs/layer_states_capture.py create mode 100644 tests/distributed/strategy/log_probs/log_probs_fsdp_config.yaml create mode 100644 tests/distributed/strategy/log_probs/log_probs_fsdp_cp_config.yaml create mode 100644 tests/distributed/strategy/log_probs/log_probs_fsdp_cp_rmpad_config.yaml create mode 100644 tests/distributed/strategy/log_probs/log_probs_fsdp_lora_config.yaml create mode 100644 tests/distributed/strategy/log_probs/log_probs_fsdp_vlm_cp2_config.yaml create mode 100644 tests/distributed/strategy/log_probs/test_fsdp_log_probs.py create mode 100644 tests/distributed/strategy/log_probs/test_fsdp_vlm_layer_states.py create mode 100644 tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs.py create mode 100644 tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs_perf.py create mode 100644 tests/distributed/strategy/model_update/model_update_fsdp.yaml create mode 100644 tests/distributed/strategy/standalone/fsdp2_standalone_strategy.py create mode 100644 tests/distributed/strategy/standalone/run_fsdp2_standalone.py create mode 100644 tests/distributed/strategy/test_fsdp_strategy_collection.py create mode 100644 tests/third_party/sglang/test_abort.py create mode 100644 tests/third_party/sglang/test_fp8.py create mode 100644 tests/third_party/vllm/test_abort.py create mode 100644 tests/third_party/vllm/test_collective_rpc.py create mode 100644 tests/third_party/vllm/utils.py create mode 100644 tests/utils/test_action_parser.py create mode 100644 tests/utils/test_cp_rmpad_ulysses_utils.py create mode 100644 tests/utils/test_sequence_packing.py create mode 100644 tests/utils/test_taskgroups.py

diff --git a/.gitignore b/.gitignore
index 5e7b73d3c..7e6830569 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,8 +1,4 @@
-# Ignore all png files
 *.png
-
-# But allow png files in static/img directory
-!docs_roll/static/img/*.png
 *.pyc
 */checkpoint_dir
 */dataset
diff --git a/README.md b/README.md
index 34b38ffcd..fcd3e3a22 100644
--- a/README.md
+++ b/README.md
@@ -43,11 +43,12 @@ Leveraging a multi-role distributed architecture with Ray for flexible resource
 | 📣 Updates |
 |:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|
-| **[01/01/2026]** 🎉 Our [Let It Flow: Agentic Crafting on Rock and Roll](https://arxiv.org/abs/2512.24873) report released! Introducing ALE ecosystem and ROME, an open-source agentic model with novel IPA algorithm. |
+| **[02/03/2026]** 🎉 We released the FSDP2 strategy, Megatron with LoRA, partial GPU overlapping, Qwen3-Omni support, and other features. For more details, please refer to the release notes. Welcome to try it out! |
+| **[01/01/2026]** 🎉 Our [Let It Flow: Agentic Crafting on Rock and Roll](https://arxiv.org/abs/2512.24873) report released! Introducing ALE ecosystem and ROME, an open-source agentic model with novel IPA algorithm. |
 | **[11/08/2025]** 🎉 Our [ROCK: Reinforcement Open Construction Kit](https://github.com/alibaba/ROCK) released, Explore the new capabilities!. |
 | **[10/23/2025]** 🎉 Our Papers released, see [Asymmetric Proximal Policy Optimization: mini-critics boost LLM reasoning](https://arxiv.org/abs/2510.01656) and [Attention Illuminates LLM Reasoning: The Preplan-and-Anchor Rhythm Enables Fine-Grained Policy Optimization](https://arxiv.org/abs/2510.13554). |
 | **[10/14/2025]** 🎉 Our Paper released, see [Part II: ROLL Flash -- Accelerating RLVR and Agentic Training with Asynchrony](https://arxiv.org/abs/2510.11345). |
-| **[09/28/2025]** 🎉 Ascend NPU support — see [usage guide](https://alibaba.github.io/ROLL/docs/User%20Guides/Hardware%20Support/ascend_usage). |
+| **[09/28/2025]** 🎉 Ascend NPU support — see [usage guide](https://alibaba.github.io/ROLL/docs/User%20Guides/Hardware%20Support/ascend_usage). |
 | **[09/25/2025]** 🎉 Our Paper released, see [RollPacker: Mitigating Long-Tail Rollouts for Fast, Synchronous RL Post-Training](https://arxiv.org/abs/2509.21009) |
 | **[09/24/2025]** 🎉 Support [Wan2_2 Reward FL pipeline](examples/wan2.2-14B-reward_fl_ds/reward_fl_config.yaml). Explore the new capabilities! |
 | **[09/23/2025]** 🎉 ROLL aligns with GEM environment definition, providing agentic Tool Use training capabilities, [ToolUse docs](docs_roll/docs/English/UserGuide/agentic/Tool_Use.md). |
@@ -105,7 +106,7 @@ Leveraging a multi-role distributed architecture with Ray for flexible resource
 [RewardFL](https://alibaba.github.io/ROLL/docs/User%20Guides/Algorithms/Reward_FL)
 
 #### Backend
-[DeepSeed](https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/deepspeed)
+[DeepSpeed](https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/deepspeed)
 [Megatron](https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/megatron)
 [vLLM](https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/vllm)
 [SGLang](https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/sglang)
@@ -151,21 +152,9 @@ Leveraging a multi-role distributed architecture with Ray for flexible resource
 * DPO Pipeline
 * SFT Pipeline under development
-
-
----
-
-## 🔮 Upcoming Features
-
-We are continuously working to expand ROLL's capabilities:
-* ⏱️ **Async RLVR pipeline**: For even more efficient and streamlined asynchronous operations.
-* ⚙️ **FSDP2**: Integrating the latest Fully Sharded Data Parallel techniques.
-* 🔍 **Support DeepseekV3**: Adding compatibility for the newest Deepseek models.
-
 ---
 
 ## 🏆 Notable work based on ROLL
-- [SocioReasoner](https://github.com/AMAP-ML/SocioReasoner): A vision-language method for urban socio-semantic segmentation that employs a render-and-refine mechanism optimized by RL to identify abstract social entities using satellite and map data.
 - [STAgent](https://arxiv.org/abs/2512.24957): An agentic LLM specialized for spatio-temporal understanding and complex tasks like constrained POI discovery and itinerary planning, featuring hierarchical data curation with 1:10,000 filter ratio and cascaded training (seed SFT + difficulty-aware SFT + RL), achieving strong performance on TravelBench while preserving general capabilities.
 - [IPRO](https://arxiv.org/abs/2510.14255): A novel video diffusion framework using reinforcement learning to enhance identity preservation in human-centric I2V generation, optimizing diffusion models with face identity scorer and KL-divergence regularization.
 - [TaoSR-SHE](https://arxiv.org/abs/2510.07972): Stepwise Hybrid Examination Reinforcement Learning Framework for Taobao Search Relevance, with SRPO (hybrid reward model + offline verifier), diversified data filtering, and multi-stage curriculum learning.
diff --git a/data/deepeyes_mini_10.parquet b/data/deepeyes_mini_10.parquet
new file mode 100644
index 0000000000000000000000000000000000000000..d6ee3ef02d6c0cdf75cafc5a4ab546028b6edaba
GIT binary patch
literal 405793
zh<-=4vy<&f*2HW+K-mV{=i;nZmMi^qD8M8EBM!SQRsB{FWXKdNA@aZ>tiWt?D6{cXJ6Rs|bk1^?7<-Y`Wzwf|*-@OBK5PFa{_fRZ2mU6%i4X2Kl%aeDRo(G&Expu~= zzxlCv9zB3Jf-`kpF@-~(Z!Cq^Y`a=S+X;eZxxMsv4|ZoF9}mQ`C=w~FRqlIq%PdhYW}iT?(E-Fw}MW|+FF-uj=^%yCE5@|Rw_gS0uV>BVUguA3H8S$tlF9(+G z+E*1vPnKndZ@$YiWPG^Wt~HS(@ruC)C6Rsg(V6~CZ-Z36Q>_ytXRZ%N&bKo(wod2zy##n?#NPTe=DEY2{=dWQ;O4598UGrX?A0F6|ZDxs|G+Z$_mDq8`zAWe%J^PArYb z8dU-qtR-p4L^?A685DXyOeVVA(hVy@^9@8FXcrV+_}E!XYT+T&T7p@+Wy`9;u_XECG5>aOH zb*&vd%|v_IgO3jstk)=?iu*W+$~BrG?nyKPgnC3<-ShGTU(uUZ4&^yJ*C~B zGSKv{UK!&B^`ee~keqU*`eOXXC6b=CS*#)D0ffY3i+zr%xnK01ULm`&ta$u1%Ap1y zH0*-6E%w<0s)t+q75DoTCDqK$Giq7stcASVFO5WSC?qacv z9SXxwo)7=`;3mvs&ZYPG*!W5)m!H=HAfru6^1Gut5h7P|C*~StM$g3+ISNx$G^8y_ zWChf53Caukpi|x+>zB0W*~ahMo_P1xLB)MN@eGGUc3P@#19Xf zfsm&qn|X&s;GQ0Y5XZ1yCp71!Mtpy2$gRCPHfUe$5$q=jZzW=a@qU4HQ9o3)$p!Q4 zlqn)n>lduy)Pfw{KmeV)9pO;dXZ*;042YQJv3%Ke$+)|{&$IKKW6b` zBzLjX=u_FPXYo-4#;LCQUXvwM|CEMII!2j*Bn@LGQ39$Oz5{m;>B?Cn$pcwZRSVxI z0LONLH&yC=rlbdDaudpvqc0qau6SclcFDu9mETkW*D$K!-L2As>jBD04Q{!cOLg@o z0+Q<~F*YtW0VG=cr-3?Uq)weXs9kiL7Nh_Dr0zR~Z(~pQ!00Hk#6qF5y%c_U{8Wz- z-GcYb427rY`NPb|(b>fB2&Z!m${az~v1k_@yg6csrC8V|?FF(ADg4Xpr!uo|piUk5 zWDC|KThwing>5V1?#QNTuDL&=VO{Y2&FdekW3JU@G;(!BI>vebYhwGa$t~GN;-gSM zvzL_DGeKCwPU)NbG1)hvDZ9kh1=+C}uZPhO3vc;$FN9z2{{fb2gFZELjNAJ1Ulx>n z0+Moy(8$WItto{&fE{jiS|gG;e-RKYv$F^ur2F7@x9!vvgyR+)b)?xQVHC ztP&FIyko8%j80)UCFAL`T~e9emL9IanJW9j^AGS%L!ap0lbbWOzqkeDVd#52dXOc! zcJyuZ01AEn%Tn~}FXH(*(4VYiIGd9e@t1ytca4CE{fNGYGW=S`o4XjHMR?R_Mpa2b z&2kGjXIJ8rPb-fgx6@5eKVgS`;%!7dz>bn+2NW{`k_h{1j<|HfNjAnx>-iJ87n-Aw z^TXJ`3JnmZ{t{QI!;Jq}MJ6KYt9@&K5f8)_Ie#UV9xHQ3%WyhmEz<|GvE-nV0coZm z^qZM&aL^bib*)T|4d0sx0aI{X*+CPifd+<@A%RfV(Sr4;j^&139oT*uZ+fi6W3&bb zdh@Xv{a7XKb`+bv`Iz*#wUV}o4R=w@=W2WD?v0-Gl|W)PBlnGRwl$C_bxE~|`7{1I zCVKk~TMQ=XkTMl8*wb`*OEcE!*ks>h&wv^LxP^ug5uLn46~o<;{G_EF`xh&{(4D0)Kz%UIi34BJR3&pjuDa+eODotg{(DKq9ROHwtYyu za1uIk6G!E17;guVni8>chJ(9qr63%(fI{5hn4GH7;}bRk^bE3i*$oV66>HX}-6@Z}DY!mD7plrIs>g<@6OaX}*65iS!+-lUw@*7K-)#_|YEL~9TcA~Zwt4_nUVBnJTlCT0;Pg?d8l;Gpu~`^; zLKo?6b%{ECKE%xjp<;t`g`yw!rT+!@fQ{BpgEDABQ*;+euQVC`oi>0rF(I+$WywNp zA*&gMB|!AmUp-Tj=kxENO;|;X|A?HJz7{i$WWqrjSgg?52#p5gFrVg7|1ixly$?!a z#%jF~vgFo~H^+Y96Clt!y2_0*$9v@Z&Ln@-hJG7`Hp6VQtlQq~n#1sq+fDvQn(1<9 z(k&%^yRS@9*&E8zFq|L1Oz#_mS`@0Mo~=@YVe^!AO$H)wN2nn`+!-x=C(aQ2aP?M} zP0Vqe$pUK{v-EYkNq5^+TnW|&(-pqEZ5ll`6@B6DIHOqE*Z=c4kGs#VGdmBp;k4fD!}>6l4bI+`+r%Yp5>J3gvw1wsFS9Ffxof zoo@G9Q)8{OtygdsO^h_vbR}Lj_PMq=k3)$aD_lO+NeJFg>RY2 zO~=qS8GEjdL`o8m5;7Q#%{lG7MgoS&q@MFe2qZcb%~v8*UMurhYeMk87Eq=rE_i=H zEcsAM#l!!6`qnUaH2cdA$6tgYA_K{UltTfPNuflJ{3H1tMiB2-ZO7PlYB`a?h>d-h zeYkz!aeMQIHSE45k(t$FIHiqMnyw`AqbXBdju~}s%}5=QcAp0mE0GS!_5)ETk&e0? 
z{HUY-TWtmd0RLA0Z3FOBl~j}f2nYZG0{jN}w+@g8prN3kq9CK8qN1XsqhVkXVPj!p zVv!LL;u2Aj(@;~9Q&Q40a2+**IN{aAFaPtfB{pU##(9zMc zFtJFnu}S&pDCzkAKX3nf0r+SLoCs+^1bP4>J^~OQ;ol&D2L3*g5&mlf{Lc*m5r~A0 zf{KQYfeGKC4G(~b00bf;0g;iBkl=fV!5;@8;Ug2!@yenQ>R6-FdlT_RrWd0z$Tjs5 z>(2gWiXvP?$7<-hsXcuLI42&TNeEOzlHrjbm7BwAtE6G zkx>7m3jr|@K7ja0$aK6Y1hP7)*4~8le358Ga_PlQedrAQy1$8Sd}cA;Fbe!)y84f_ z|5En1k39KysPmp_17qpVWd)%*_vMwS*3f*zRxS!%@Cw zYMt%}%tRDTZ6-QDXk=Vj-3(UuCf;wz-NXI&uiABQw)k6p9~Z05ut)D)Gd>35!=V=H z9@O94G#@TJXu&(KT@1X~vw$MrNB5^v$)}|d4FNDgPb1}?KUB;uqwcyU^elw*e(#US z@$&VlrfvWxnJ|fzf1d-3$S~PBL70XBNa3?XlCT7~h+w_Bo=%9n;ewKv83*e|?LPny z+=%29g^olZvMcxndGf2J0AEXwix;&VA;Ut-&jLw{awC;P2MUH5TeHwPYTq;Vysp$k z^{UsuQ?8niYAmgWF&?Wv0Wh0EH)S*GzdJ23C1ZZdoC zp~O%Wy>NHqG5`Lq4)tq$R&Pz&KY%ml({S45W?`jtzt|FK-r*Fkf4eRZnhscmx)9P; zX<@P|(DahEbzus@Bve%~x>=`bke6@4DTUM($BGDSvWvz`*=}t#y=5qjl>3dWi?=lz zkWC#FYcyqc_#r&*o9p`Yfw_vy{U@egPKUKEeeam_8=tT#)OMFo=tXv0n#bsA-G0Ln zCjACXA!pIm8V&T_5Qbgv2l=f1R38n#3j`GP=9+r`ls4=7y7N7UC%$fk`xWe>CyW@? ztQ*+?>l?)J{4e2)1sB5-VzpKh=I>~wk#NldYvC<2Mp9>#Nw?2XReea#IO8dY$kxOF zIAS`u)Fa;Uy4zA0a&~YN0XS82kxfA zXyBxu;dt<{9wD&B7$LSeSVE1oKsPbX_a?ta?GkLhP#j-Nau$aJ%hgo8uRF~yUtGzL zv^=&KcBASQz%^r$va_^(aG%P@9tBb@L~<)6BX}$#M2wKXH}v3Dd~O0 zI`4%R@xI(b(T&pXH^OZ-i(?X}jyw0y!2#MZWDQE5b(yoy4#fvqmJrz1hkpRs50cKI zReN~?WQLyK7=)!|r0X1h@>8&lpeuxxiCTW&vt=#`V%p3lMP~VkQjh#3aY=4RGhtJs zNi%uFXjwcAsQPLeC-*h3t82AU>C=?iPMOmp(2B;#xn&SBXhm9kqv=sUz0=gIzTr@~ zdglxO5#3Kl7cx<#U~i2*lqsKJ0FN{B*Fob7zfsE2RXE~KwedXNH~Sr#8MOwrm9pz= zTrM8=pzR@9FC3Z+2Rt#JClQ1cR+K?hvr-wYu2pL9Mboa|3oQ2xw>f+;x-ixgH}jG1 zkGC9mz)?+sD7|4mR;GeSd{G~8_CiBaT0iew&Dt=>Okd{dM#LR#>wmGgN#)dT!I{7b zNDWRw`<)WnD&aHe``4h|hv9mb3W%d7jQac7xXT|C4}o#+Em%Z0`uhG(kSyJ{8<%;> z?KhwdtV86$!xyss+eKV00J5fzm&iA{mX~eFtkuTcx$&`hL{1vQFmTYKWaLQQDIFsw z-_A1#mU#eCMr*72GGN1Vj&Vxs+)0YuOmVJE%UCPlOIp^$*RnPS$h50-d2j3}am#f~ zo5ZRd97wZCGR1^(cdUtyMqp$FN`%G zU9Of_RVa0fpKb5ePT#BQ2Fc2vObh+O= zm%U-}K>Y`(sp7Q2fBg*+ACVbfD$2G~jul0sM3HzRRP$Ijl=Jf9QR&}3eZDdzv(D#a z-gtI(9R)a0V`w7z1&}~xa-~B)!fK2CVZlNJgj$lX&Vf9=K@~4Ot!pF z_z?@N+G%Ms{yrP#E%~j?u)TJ`4zC)c2msgv2it*tG*v1J#>BbaEa0MSy4*j;r*IG> z{^Vp#&ktKL@PsJ6AMpy==@Q>GywL+I6>2B2ZRn5yok$I6&OQJ}!tX5k%ou&d(&G7Q zP-LjTLjkGHli1FED0eVN?OXX7DPgRs*B11~X4BAt* zVEp+8_#sHER~TYIju)ZQbP2eT_!#0KPIS{z@6!6&eqh_W6G=beZKg!YAcOx%u$7kE zJPcjI5w(66%zn0nH4{rnx~GdHv-gAnCB)Cb%$fO;g9BmMP^dgi%bXGhzZXCtXG~Y# z|76<&ZD}0O>v&Uf9KamtDT0N47$E(4hhf}r&z0anA#cY^lkf^Dcdc08fCeeC4mi?V zY^~mvJ@Hx$IbW2*+^CgBB?xtcBAhYgGvi1_B)$~Y_#S7-{Po39lI~&+{i5pU;T~zA zF}WJ~^dm+t$=!}$v0S}Gma>`j)p06ZXT41)h#~gZbv?Ou_?=m6SL~*)f$|NC*EF-R zaJ?jyUoTPF9-_2w_$xCDsjyN|YLW}_U~uMmS4(?Y7j!ynL-FDH$uOcovdAG;X+H=d zL|3j^3CU<>glv8J3d+qKXhgtfnto%;aCOFh_EEpRGu`g3*C+ZIak`wQV{8J*# z_3Nc_wfEV)VNLOvs_|(bU?itvc1fA-kFFA8NTpRmGz*qNuh;hv9b-RC zvN$-#KdEpRMf-W<)p76O5zdqCiX7fl^m%(!I%ZPsnq^6xOO^lVH4JTcS&x%zYW*IW zI1ZFaQoA4veQWV!VR6OH;alrX_#zQUKD(yZhoji1^$C=RfYA@ABD%pH^Mb0o2Av(9DdHg=e)gEH*kcH1rIin-YOBRT#L3I+ zj65dmW3W)K@jD-`LGKa2^uf9bB_UZh2Jm`tYPsFGFW2H~ ziV&Yd?nqWQ$#psF1|eJ9oOqKT2jmd-oAz`?@O|az5#X7fT1H}#wY;g=WANZkmeDbB zmS^|Xgg;<&nK&6(v%$u=`(XHq@#7c*ladHm(kn6Xn-cqW2)QDFOz3V}HBAkGz$DY< z5%=uU*2+n&7cguKwrnK!QPZ{JHfnjR1gx_=VeOVopp88c<+d;?`dK-6LPgcyl7CBF zS!KSa?9={7=rRkTC)4Yc^xo6rZ6&x*9 zqS6o}tnTrZEwg^j`2pXV1&n?x{g$`i#THLbAMCy*v87iUJ$E4_*qS|B;S^TZ>8Y3> zvcig_HK5n2ifW#23G}UJ<-TLKqJN-l?~5K}@KDXSAb1cXuhItF(K-*^ zt~!4X_Ths?>sam3*4`;e(5t~`P9F-Po+kx%q|df-ftM6rZUMlBdkKB4j0810_7zy) z6tV(Y=|eeV;YzW9ppCvwrwS8#)`ljZ@1+*_7X=p`=j4(yi_}JhYORCZ!hgnJ!!g3_JE zx=5e)5aWd=9gvR4p45JSf5c8-+lEP?=+Ee}zLMn;|4j-qYx!48uX#4+Yzn-2e#6`v zkK%;4OVRbTaY(v*YPO%;b02ur7<~}kiJ#9(3HDxV=YH!S*ORTMQr>YC8=^f*GJfx+ 
zj1`6}aig$N54Im(@)&kq-84BP$!d`^vR7y#g58l-RZ`m|3C*D(ra($V>a?nzkCC6J z8D}U54Sc3XpehWMUn+fkjipR~%ZNVeeF{^UGXW2qqhV=o#ic-2jZYGzmc$RD&kpy8 z7j;psdnK=qPtEi{_cXOd-P>r%8q!QBFa%kDwyi%JS28dPj^3(Xz0fE;^sm}rUmx|T z(P&LFYGw`_zThv6qk<9;@j!B-$yw3&A$y0*-`VTaBSITT8Ts8mMuhGW77qNlu7{Mx zntJ;R;A`@Z)tt+CA1_?}e9&Ld%(6E(Ua>S&zNhzE1smmiN%AJ)9vU3BHl^-GIRYtj zH6tt_m0TNKSl-|2@&-uPV$mXk5iXBkwe%)=`rNsod~!$7u?yNx$xi_mY!?0B^WdHr z`<3$0g+9ulm5uP|#;V<}&e2#AYW2{{J?U~S2@G&piO)3;O^C-#*h4klLLr+xsumZD z`4%(Be8uc&P3Hv{qzpVpc3`9@-NTOVeTK~RTg{fiWRi21pO~*T1bxJ_QfsM(X#vcZ zj)5wXT-`_AUuQhxb`4WkGG^4M>x#D=OO~m04?I8*U1DLV2pVZ|`uq-E(8@ zV@PQ$cgrv)xjwH;;*1E~Z$C@lIBdACd5m~7k5a%@AS=TxN+BLYSupsBTV3ozLMOZ| zNR&Wcuk+D{`(dcRMDNJnbWa{ARVke!RRAD0!7NLv! z>+cds+R=TOlw{O>)sOFur+j{*A+GM=JJXFxQg4P|wh1K2Ei*X(0BO%IA*YrtH0;NQ zTW9S{2`eZ{!(N+_cQuBF=ha)PH(N-V+-K>Xhm9kEaN9?NYJsj|2l;?xHz{9w0BOE& zn-)ta>Nj%Q0RC}K+L~53pjG3(%AdPR&3Iy-(3 z@|!T6{$OgrL-*DE@2ZkH?sGb;?#Xc+QlTBxh<^Zc63kMukk6m$G}KhDcpq}2)U00K zzpyWs=~=dztPQ@JAYRAN0#E~8Y@Joh_ZJ*_p;%FxSaBUJZDvjOGq>y2$;~QUb2frE zz4&k4H^O$0m$GFkpQ4ljH}gwSY8)Pyd0;P6)}NX|tHRa{L2@)sVH|!FZ{+mlF*o5K zE|Y6;Hpl4#ltRb=L0a=x#VpHZZDMH8n3zic*I0xY7<1h8y_~`|XYs32`q$J-)iKb4 zY`>IbYDZE~u$ zkxi|)0D|{WA9QMbdk}* zR3j4L##3H@l)+$?uFtNL7BsD@lxqvorrSJ?PQR_@F1d!MicHfJ-hc9cX3w9xB$Uqi z6sf@n?#egFdDG``UunNDvPSe4`NL~lFfU7H<=6x7sO@O=bH)42AWe`Y!mEEE*%w9v zB5Kntc#$NcK<SRpOP#FU-?N^A(~b@LEU6=9&h^ANG)b4wz_ z*{(so;yr(H5kEiF{ybrIfrI(M>-~A*W_1PHH%Hfs2>XY5iHh?CGSofUVUZD*2C^z} zpZPz4zto+Dv9VXgv`Op|)vQ2^rZtAJQnxV4;ar2sxCFlRBfBc;bBa8F>`U8^6WRlW9CX8+lJ}vGfCN^sjZ0R+Ne}Ysz60#6_M;{ z^oe6r#kQNUt`%el$=qLg3XX{J4<=W3uQgvgelw%tRFi1LSn3&7@xqMgsIb!>=?Igh4X=W6Ok9pPP{8hm^vO7?(@IBcf>8aL&glOpz z8{=AGvEV$GQr0XkXDjNnqd5HZkTZyFZX8r7SV&2aBt!nxI@{jt2T}N06*2^D^oHbhDqZF&=>~B1#DXA@ z=}almvd0#xY8nXPqlG{@;>%!$hgMfj{6rcnFJ8|X;~lWNGDue}nhbemFK!pHjrN#e zZR#rJtvqNnttl!X0E1!%SJwC~)dbT*hzeXkhgvUPX(GoTbGOj~v(OFg4aD-!Sa@++%xqNZXR}Ff8Kvm- zbV=2jTbm>vOyy*1>Z};}QV~=|_kK-b!$fifm2@v8L#~B-ujK07yj`Arq?m_ll1usL90t4Gc(bW4-w&JhSKBpBINc|`$3H|K0`#@^pY_B%Rn6oz73k) zlRZqHOdkK3fg2|z7&oHMk#W?~R*YYGEt0KW`m@xh!ViIxr`GCf^y|}fcIll`Mw|l+ za8J16Rw}=-NgsZKJ@j#Qdp@aQ*G^fM>uEG!g(lYG%CYft~4dj^+qI1SW)cjxXw*!t`TrH(u z&v08~a`T(b#EFg8yB!;e<(8`j$F}5J$cogAmiP6wF&*bIUlpY_+rH>e=XzZm_r>y_ zvnOp5z0pO#9F8C_9M_zwKi`jRQ!f0q$37QyCXBzqz2W|tpV!=OXyTv6d7&GS76Zvm zNLQ|@+MNeVo5=d8b)IwXBucj!9|$_j5nFd3{TO<9H1s3b7)_|JqkgB8$wRA-=!LUk zg>~OI?PBa0lWW`l?yJ8QQigScwxpONi^z+su4#eAucYhsApuqSZPj!hkzO0`=*W>q z30)+64d{Sm?U8ws3lGe0#q6*UcVguFcnaB8#m*n~=Imnv2DN+?Ncs%ElI#8%Kcqr~ z3d)rCkQf{qAhjPlCa{6RC7RjPcmDSek^hRlQ$3@44bPhR zE|AzqFrJo;!NYe-c8GqJ;v->6{Cqo(Pgs*F&+$D@!k^E3`S=;+6*4rs%F;2|#X|ag zJ#Svkf(Cr%2(zR$d$r%{8s1P-_(n!ul5r2rC;AU{f;GU^t!6fYv~dyr)e?bkG}Ywv z@gu=}&Kx|PlF&(xanCKqc>&Oi+}rD5E2U- zxX#g4N~RW{Y7nqoJa0I1Z$+Ot8d~<5L{9VX1g+na0TkAhHdu5?er_z#d(*eH&&VEa zo=}}ejp6jo7pgoOmeO#xXsUDf*87-u!;NXxb&TmnvO=;?8{wJ1NXGL6He5hE%5oBm z7^Z21DTNi05KAZIY^m4WYK^7eB+l^6%Tvuxo?(olQBn8xaG}BK4AgLxV7(qHl zfPNvUUCJiPI1Z5S6;ZAO`It@c9hGav52{0)=fDp)y_7d1+|?8>@$FG2-7ypudk7eV zoyi3^Rxvss5oR#JWMi=d1qGIZ%Wzj{fc?p8h-f1{L?A!MY_hlMN8E<-&VUwJsYH9?2OIZb33AB6I+x+C@0 z8`s9^2{Z0voQ7;m8yD(sR~W!+t63hYEEK6ll=jLu(EGJP6R>I6_-9B`b#e(ifNc2E zqU_aR%G4ZWyVCffG>tgO`tlv8?T0SDDvXiT9CwD73V5!y%U)(hfK8KGtAo+Vm~v&C z3`ulP(qWpIUIW)q!E-S>YbBH$Ck##uX9(bu&gpsXe&_SWtZ>0nE$N#x8CbLwyhn@K z&kK{5VS6fEquMedPQ@O(7!SS~l3DWpRvS~4wvWkxdJiR00c@d zhC>903#bd+CC>{cGJ@Dv0lmX~z8g0qo+jbRR2S7&d1d#GV>e_rp8_9?P#LkPCZ{RA zm-4i%I^@|mKN0ts*Q+UId+1Xx--#np&vnrqh_+U}E9 zHuE>8Zl&SfF^tcIqED-gO~-vS1!{KfSy>$sC{3=-1kzT7s*$6ZLFwDdM zAMe}&K_~|)SAW>3p(D}-zV|ncDfC1cNuGTN-~~vv4W`;^ 
zWS9O8X#_syE77TqiSl*bmV8p9(LG*xK@A6&?g?JFmpu0deqP!*=QXcWWO{bN4{ojM z>OhL`69zxtAKnP(;(f^z%_e=Wv8`3%nsu7}M7#zN8JRI~8iMxLOoaZ{y!;G+XU)C2 ziPx@SNPFJ-D4C0WGs zhmeED4&jn1x{?%F-UyGB=K-i(|bJn`&@Ll{`+qJBg(#Gh`Ir-5b84H5Ma&wb`o(JJhyzr&| zn9Ci+a84P5$22nKWn<7NI4AO>on0uGZ4O%4Z;nDkXB`QrMQSHjC1oH3_eOAgV0Sg4 zdu!#b#BC8tV)F+O1rJ+MQubYWqY6bT_t5p`?Vw7&|bMo7& z%3<5icVvPND%(qCZ=^U0=3mT8WUmn~x4mB0@8^S3md4@}Z*gwLUN^z#u?PG(towV5 zS?^IA)t#gYi5g^*d*BXOo|RIIUS!HP86KX^AS)cL0__hfoQ(Iu#a6k5M=a?OZHmM$ z;m_k;hL5Lewvq<&hxT%hGi0ac2d+Q*;;|;twS#s=)}J+`_Y<=J0HlEkRzEXjz5;*< zB%TIFdep_+aEYg@`BqoxkX&0yAK738!54j_Ad{YRo=-h9PPWq9QHo7U8(Z0Qn{s~9 zG(aH=2h7Csc8+-i9AhGqD~%EL`)Ot=d#6S>JUf{N)xg@=86ci;IXraDbUK~Kh;^83 z#h-)T#@aTPM2g))BO^Fx-_AQJ*hVwS-I~{yHs_N_osLsdZ9VViTUBvtGSKb-Hs0C6 zJpMI6EGRxesLk^+Ic8sCmvZmidaDK; zae_MHviGGfXzlrxYo2#>@2g2_V*y(wcC#q?xh0v-ILG1EnJH@`1Ar*(@{9XKqVIs*X3tGB+q= zfH(u522a%Wt(nD~cHHATzV3Spc9mxc zwR@*iBV!Sva=p1@>&+{wqz+|uj4#Zki6pV;2RYBZ3?77X=nVqNaJMka8n@0Q3^tN@ z!6K!$vAC48S}3@Qc884)8+U#Q>EAz2^%^DeM{NzgwD&1+&A3Py?`{S^07vK6x3v8m zQqr}{nN|oSHtMWxmhvLS8$CvSy77Qi;+4|ffbA&H4xts4%F#z62HS3L{BA8UY#r0JTc(db)7a#-B&htvQLXh<8T;(^6n&#GI9XN zPJfFPcUt&cpj~OLX(j%m?BPl;n;t<2?|sfbwX9M_BzY`xT-`^hTU=btwQLCin{G0> zB=OvJ`tw}AiLFm%rN<4kY?pI<=a4gw*soN74(N7p>iT4wwye6M+Rm`E6hU|qPT0d0 z`N%jQE|3o(a^YxP=in-L_25?Sxew82MYCQ+*F^b>#?gjGa6&e4qa2cYoK|lU1*2l-w(Rq$ zv~5E6-hDzSi`&}Z&g{Sh8~{1*k~paDu4a*jWx>xl{W-5vzwnGV7S^`z@m?jYc#qF{ zhURQ;a=_pm?%*iS2iCBYz!E`uD4feJf78=}Z%$=ez(@7^AnIZ?^yijSRPEE($UemH~M+yQgTWsF5NAZEgHTNXZ_Y)c*kRuDnQXl2x|0c^nYJ-b=9Szyh?3Hd_!l z=bpL2#Y=B&ZdZ@Iqp9pEjs{nZ3Rp{~WmP089Ot2{6BcRYk|PYVnH3zLoEB{TO;?zW z%e0a)+*QLnOpUi2$Slp@K9xwW4>|n0(5@xsOB)qY@}5U=#alBVKYV+Pa6N@UB4SFs ziTkw&8^OkU40`)gZ6K0;a85YSr7HoU6i~?+^I~tCp;P))GKpd;cBod*0PZ8|Xtaju z&d72TsbV++wV~5mXyg&HU@qcil~i@@lgTHb6)8SqU6GR2GHxiTg}$9w}Y1jCJFo9kLE9TzEd^(51$p@XA#fvCWtVh;`X8O}FFJN5;yG)o}!dv?+qi|C)M_b zwvEPB7iic>D~xU@_0Q!@j&@{=eVLL_Sz17NC)3dS*Yw7F0;2Y~U{JW3GGWHRnnaa=cc7W2)6IG<(|? 
zgIKZqR4|RW!A4R$ZcJeQrm-Nj(U-)jbE4dhD&}ir5r*D1c*}rK-OtOAG1T%o>sZn+ zh&9;fwVuZ2-bh`MBzcxQRV(*~=E9#+z@7)vrGvuPlSOJ_yR@~`e9S9>A}hXfxL`1F z26Ddn;<{<6Le5h87ftfY<*i>u({42sGZ(sH1|^DB3&;oNQoRS`#c23WJu1&gwVKOM zf$pG{)6aO(rD8BJThpn5Vbk&NQ_e|T3QQARqD~D+uH=2!*qtxe&dI9ZMWR}-- zH@6K3*`!B}(gLzD##`skK|S-F4*2O<8r7x8hi>j(OOtV@z9h0)07zL$3$*nnMJMj# zxIawE@9riaySa6WXnd(mV26ey+Xn+@t^p^U=D4LsQI@9_taSj|-dmXxD|Tl>UFHA} zFTdy5cc(_y;i8pT$GB~cLSceu1Mtp&rfY^*S#+10+VSre-cR(K+j1h5=Y&=BSdD`t zAPTbj?W$={rbBV5+J|(MNj_i<2M2yK%uir=P@@>{KEhVB{XzCeaN)&?*t6kSEKwnB z%1OZTzy!84*Qadts3*Gr07N$OyLmHaHtVsm!wj!qOpbcuxf?0Uz2)EbxAu>h62$UK z9!0lP_jk*H6*12lV~hckDi7_eYpc<1cc|&Gq{)aSf_s+olcPQ|iq3ls;eD}5)0CRk z{{UatpsaPWNu=7@PPW$WY++rVTPj2(9nR23BORB&1m^;$lf*Xnwldi23HGF5g(8%2 zfE=j^#|&}@C(^v}*<;jZp5g4T?4!roZtb7_VI2`vr106w{xz85xywS&0hx zA1)UPxnd8j8sCN*=UQ1Jxrt+2XjWN6G@u0p5SZW(_DSO%O;o$MTkBXYt#xrZjebR>?0<`D%{u<>?rVAo@M z=UR#%vqAQog8o;WVU|+LIRkdwVHb>UBaXdkP5tfEZfA4$WeD%{??!C=hd9m$Zh)HM z?seOrvdLiAH*B*wZzx=3%_tzYS7>4hJrn>s0BX(E@)0~Y^Sp4TWS+ta36YqL551Nj zD-04q8Q>hzD73U}Qo6mHCfSa5K#yxwMW{g7)&+ z(VsHLO3J%NK><{O#&Mnpzgp#5duw}1-%7dEZDh5KJc)gAux~6CL`(@Jfw+_F>A)HK zwyUMhe&+JeR1F#^XsiH=CnKRHve?P~9=}sb!tU&8YIg0ZTIsMtAdSAp&$c;t2ORoy z>FNBcbiTHl;3cz*i6xhaL?&h3*qyuq-#_GWaea;5ZRXLM)JZDD1;d!p+wU^8jetI0 z0LMASPjPOOLf_kWvFX9Cot_m1WEsg-l~r(~zI`h-O55m8hf8&PZ4sAinbI7zo>AE2 zA9VwORA-ZclfWHn#9D>T)8VWq)8%->d1gr7FfpFFC#O=~_#D?6V)pt?l!S;c;E};- z)h5^wzYJg>cn%I&^u{?kth;?FZ13(Z?(9r(+)P>qmAtZnnBlyDcQEV^80XTf*e9aX z`tu#MJqagAE+j~$0Rd~({n(J_rh0-gj2r`xN|HTF&UN!6mIp*!30mn&vFDA-pl6PI zd)J&nCZVUxJTrM-dZBl6E+LFYU3d%5OBEmif=)V7qG}Tqd2e*Z zkx>5RFvKwPxq*-PvN#*MWS+He4I4H-T`nhyrDhAcHW`a2(wMG(3L!2m33EDX8*i+MS zZjQR+OI;i$l?RCQ%iCMIt!0T!asv@!34RGhAdEIiZM`v?O-I7FRKbg~J<@EJ zEm!?vk8tz=Mqh3RT;r$|qdIBb+T|pdQa>Nr zX)s^4&X<2`)@^c@OKP#XcsAu&FYtt1XB?7AJZ87_qjb|rqUrPhYb=)r_fxo(N3u4W2!E8=Jy{-C^>!aD4SDK<$i! zS^7VP?yv7BmSFJ35#Jn&o1>KHqPIo^zAMnB@nLu;+XK23KXsHGWBrs~e;UsI(u#Xk zx3y{fpqs>AX%0JlrMqXK>0GjM=>Gt(nJ>uYE+Ei#J8h~pF{sNCGD{W;w7rCkPtVsZ zzz(<-Z$;DfjS+0$Sh-7kXxq%Uv}7{Kh#xvtCMidgA>C$zgH-lf#dXAG>KSVG%DDi1j7GvAz2r%fkz zIIBCJv8r3FGDUTCgY3$j6gd%-$@zNY82h>4bgpDvZ`NT?IdIXJ7(6%_>G_K9JVB^W zVHsOMH=bK^?p%BKA77<;7Q1}9WyYC!z#>JvI2;8aWC2}V>P+60o6yzqOn-0iwe71k zPjP1A_8{5gXWrlfNGZKY9<}EQs0+JEL~}#t&mc1E60C@jC;%)Ndc*eDH9+f)jQM~YrSxdjK|pcGVa9vbbylc~ z(bndLrmLr0Laz$PI@*;N8nm#MVJX~HLmuTNPdUj2faC$iRhvb>)GP)1>G8uJ&o(R4 zCXw#?fU&`n6!ka&U}W^H4Sx34>I+4+v|B@J3bH&zJj}->*}ZxUb^ris<%ErIqa)wV zZ}vu*Fpee-=dk307v|}~1dRG(u(RK(h}fmobn9zWlTwn|WPS0;9GjLs_{j^^P6s3n z++#Fc>r+pwv}ruk!yM@01SpE&a9!0$Ty)M)uQhI44GUVhW2TEcNJs%Kr+Ea4Aq0^S zT<$_~%MwS>0Z~Wc&k$)w87*Lk$q25^b03rlKX~PjF@v9hZKlf(Z)7sc|gbaCdyn*#10{0OF(6 zFZ9;Zg{8DsmzEZ%G21+E5e~=Ze=#svdIH9*3#~HF`p(u}DlJ0p5YY=sESf88P}nZp z%lVTxp?m?Hj@6$cwu;2rSfv+=NxB6tZgi^?C}tM%iCF;{Az1RGj-I5FO=^!5Lv1|! z{P<><_mV>-n^1xQ4x^pJr`{y{`cl;GyfI~N{{Y#M$vf;0#$CbC=M1JdYys1=dlSww z^%(7DytcMkEuv`S3oJ3j6L=#a{zwnktsUq9nUuzSFb$>iu z`9%+P00WfA1Jke-w{56HCZ74axwMSAMDp&uqm@!Z7GgUdcqHc|(zrX#GJ7;x!_PcH zm0n1#oJ6CqFV{bZKDD|1k*Ge8de)atQYJ@@8GMuFkQgv#AA2B;xedS|5mw9k1dTFEBy#h|9BP=x>9WO_B=SzsdJN{Gn%ZqL0xl-g@8gbNH2s$6@00gY z#yWo%Juov(#`*!XYvPoO7J%wkDRQg|%F(ZvoP5j_VU9WgO+hBJrRvg271OndNzzdRnbCERKOW51Xr|D1^jlq_GIoZ{^AKma3N$K~F2WqWpd1V}^ zF0EHSV)=`QF-k~2#&M9C_rqU0PN zI#OM0mpbLlG3r`%ox`#$W@m-1j6|F%XI2Z_sRzA5b)?SMi+et(W7FO>+RUsN44fjS zI&qM3kF7HE8k~1gStgwVL%>yEE6rdy=YYo<aTFTYup3_x%Z}eC! 
zW}Fhxh&IME!Fbc4=t$~0HC(sQcDb=;-9lGMFB@x*scGjhDEnsuv4Yt!098nc9x;ET`79=~c-+8l>(AIp~3{mPk z^(*5uN}D8f9z>pDJmg7?af}n|$gCB=k5aiw8ph1Xs;ZcaVr>HekeDU2$!yiwA6C-r z=b8xt+(Npmth@Qh!W@t|$0D<8N$xF6HVt>Bk#}aZn}0B?KLiyVVUG&Oo(6e6D((IG zx3{;uw$|=Svr5vzAqv}z90AA5G1r`Ajw=S^TD=PCE4*;Xz{?KofKMk1cpUTV?@X~t z@C%nnV&KFk||ajy^DLE^%U{F#AG1>k0YFOQZ&Zl zWGxy(%t#R&A1%Sb1KWy7*UTz#%y#5}I##B|Sj6_|MZ{u3-Fp5x#V4B+i5lD_bD2F71e7J0cPAiwAAf3KbzUX7&|>hU*Oqp({if<9 zNL}tS12zjB0oVijRwu*zTUWZ&F0SD-c~L4xJk7r|9#6jm^{ji}v|Xaxpkr?nmQ@3c znzg3Od#6On400)v!!`yxaalXY4J%xok?0daVD9E*El^p7SrL~94j%M4EK_i-!U(8P_lm72Y zQFl9M_ea|zg({uLcihjtifdR5zG(z711G&*g6`YwrLV5;&PN^83Ma0!t`y7>@P;?l8u3R0i+$t6Eow z=f6p>B|#mmVZ65zDytE}V#FVmdgHBai*FCEYaN$LQ} z_3Mf}O=q$uva!DM5XPl!o&H%Ba&sGz(>=NERm|RlY|Dd4)2=kK(P~p8MtsQ&TQdd! z0B3TX?f(F4Z(MO*cZIFymdnFJ*|*9y8)=ws1b}AVe@fvjZjHS1M=K}qV^uhCc=RW= zc6wH~VWKvbGaoT+W=S4206Sp))yV_fHE~SdlHTXgy55}(mh<0|R%l^zrF)IN52&vi z*X^UYn%XH;5Ut23rvkgFymt<~mlhD|_g2t(YRz$Q%A=A#Tx9dsxSP9+Jx1<9=A0?} z{{X_H-1My@6>F7EG>)rAnl$j-D=U>vNRB`EYGOzbXb6OgG?ake-+|zAaeLmX=kSe1R-zXdK zPd(35SSXi~+a>B3jU#U;I<^^j&(|LH3Wiql)>S*yG1%v6JwQI=(z&G-B-1(Vb49N- znB##?mgepp7MX4i$wzf%?e$z$`E@a=Y0C}77IxPOD_qSuh$#p_RYm8JykK?Xt#k8) zg>cfY?=N!f=O^BsaUSML8c96PHwjRK^&E816mp=lEa|gYrl)NBj8j}&#!D1lvT$~< z({?lMp0zZ3toC=YT_hKd6CMbrLpr2WYkvTX$W&1EEJ(XdOkj8;DCDlD<9pt6onIrQs;RHeRYv{@%_mgd?|Dj;H2RgJme zamYMpo()y7)HM*7>uW8>%c!>82<0K<0Ozk(!RHjzY?vhMK*=?%_fV{gZeuOGa-1;7 zE%N|#^%ZReuX#adKxeTcoPTB&wc3&vEaX zV?2`S@l5Utt9o(9YSy6) za{(aK;*wM)(Io{|TygT8e9i8|oE~c;!fCd(x|U!;RgVCv^%WJ%%HON^B)y!Yder~d$2tl8eoj|ZCavO*D6BVd?NbAZFA zx$Emy?M008r)Y14O?OJP(e%yfCzEv@MHRau~X2V;ZL zh6YJJ!0tWjcp{$hk~oI)09~Vi6dt7ddYX1t9)`ufp>24!(O=q?L6BJ_5vU!GK*((I zyQjAmZr0;TdwbaK6xtOTj9Rd7IEKoM_xkr4tes~}z0#(R3;zHpwgx|Y7KAEJVo~RU6j*|Z3(pzcTYilMUO2c$jgUUb{Q}bgZILu~OpYoRfmF#!*l_YOkK89li7G&M{AECEgw7 zViXXlt$|aWI#V-gzh{;PwLWz6-pJlns9}{?*riahF-UTXG!7v7%p_@V8NXvV=mx-cj=$Qn%7-9Bge`EG2wC6sCZQmk$S|lk2p3L z9DJSYe4qelEQD?uT$5ZiG?Orv?4$M-RLPDYe}oWDO#;))BQPW6a5h!Ut2%}XJ>U38 zueCi_Di$rW5z63?m;V5-QK;_B^|q9rAY5RMgO9?b7Pwu%W?Y{6_M9%2S@zDBIe`MSa23Hw8eATEWV&pksm00}t&&s_0?s)tuRyj8k zV?2%6BdGSJsP1AaUI?5jMYtXjju`zv{dGp_-uiVbC99q}$mnaS5;}%NP!)zq#&h*F zpt~MH+y-zDBi|KCj_mO0rG@ncn*C>*29tcj93+x2UI_;W-v+gHJzB!(+N@&v?Cso` zEIp( zYnQW#Cgz(e6$ifHRx>(4=4M~SYcxDegk+^D`_ zuMXILYz85SetKt+eNA;1)9sF6(Z?#d0g?KibH!nJir9G;v)czmh?tyh&j*lm*yoSR zu$^nkq(iOlxZ6{WyN=%Ck$L+F^^s9E-hhPImf^o#<-&4}A?7VTfMb+uXoz zD0WoJh7Uk7P*-R^Kcy(^WHZ#Lk_ z2*({$f^pn)-nOCCqSG21k!uvU42q#;0lc*X8Q^|Cy|YScR?~4i*zWb#eOJSmG1vg`>=X}9IpVf0wWDT}!FwbT z8JxKLGKL{satdbyBlSGdnG6y`V+;f18dwU+W)666qT1{zaWrhW2n%?D-UBSlHX2>KeuLN%;a55BE zKdjzbc^1nQF~_zt(fJd7*F5sL;~lZ-+Mu}cRpSL#Q!tFF$ct%weeK6SxZn&NRa#S5 zUx->vWYktdjNp;Gs3#wIcFklfEsey5NCccNT!O2d^%y6Ql={;a z>S=Br*-;iuwlEcc0n;ABo*0eH-1fQqhwaZmlw0oc{GU`P4b3F%sKM= zs;D`~(yePTi+x5ZEo4@P;xxEHox^$i#h0%Hf8Xt37;}Kb2l2rogC~~} zEegrob1pWYr>!WqeM5W41d%eRI0qn~=~e5oR_P8Sl?Z z?kpA!E?1r%LGHZ!a}a7E+lNw_TSFV1we%C^sS(_ZjLd8mj*QYQ|F6sQqtOhf%YVAZPPi=3rG+@tg<2A#t9Z^{#Ez?e;vY zy+O{_IS11<)gz7WSpwp(g ziEY{!^Kw~Oa5L{)8YDNGRm4~D*gTg|$R0I(ugnPNuU~rY#-AKPim434IAhoG>q{Ms zM01ck<+pM>{{Z!=ePQ+&v}Y?Uy0xsA7c*bYJ%^k2r?s07OL|CpK0WdeTvMjCxYORt zSiQ7Mc<$o!t*x1k_EE@?syea)dLH<$z$K-;Y{tml&hl)C%r`=xDI!9+!<_V1!u?Hf z7xx!ho}n%6^wQj2$;(SCZhgayeLG^h=~k4Z?{04kwRUQ2*4D{2rlDgDDP?;CyiS*m zJ4nNpE*S1c=NP~qnB-LljyRk=VEy8GBLs2MvjRtK@XoM|mfXZ~yVswsX(>`&(-`Haeau=OI6ye14xP(D)$9(jN{h0E8Crz$$P^R-IZT41~?$GO~igQ=&g)W z49oKJr)dP_dsN3`+qAM=#E7JjjD%ovNe3AGJ@}-G-+W6e2+EMZGXSmu_0B(?X8KrG z7;UC#i>p7&_qL9LqX8|+X%j|z zx0Zo{_v_Q!G$kkSqLdri8JmKTS~MJIAo71qQ%#y(vC06hC*q+hi84v{mjnZtyOB=hafcHi0d(G99{ 
zxy}mi&*o}a;DIn=j4BWAVR`yhtSq{+mK}FKTZZ{A>E4>vggFQ zOGggNf3`!s1wY;coY01wS{>e_BMVnm2Ot5(RJVm~ZB?Ws9Byn1n^AU-a~T*sXVRk7 ztTkD5*EY*+P6x^Mi1&fmmCi>#wWLdXXi^&6S_uc*VVXx`Fr#Qu+y4O9Q4L+Lt)+~} zCx0yNP`g9%(2uY5u4nr@Qjn1(Y|z%ox7tr;Dg9M^qpv?OPRt+ z0|KNr25?4c(?IJW+z1%?K^<$-WYFyKYVEy+s!t@(3~I6tV~ReC zIUM63g+s1|?GH1y(;Ds3V2RpiRYjYYW7ypIbqa{6O*yU z$Dm#u18*7Lpoubb*ZI}-@Q$H-bdf@l#~YpPEV6A>&mS?z9X%_z`!s1D(-2ZrWFq6{ z&pGc>`DP9bUS0tNmBG)U^!ggh>l3*=(l|JLEh9!@lQDuoT&Orcp!$1N=)5JeP=3}n zP=)^AZaBb#pvTW!ah z!9x$3`6Y4APh97kyfh0{xJd(|su0Vt@;x?+(^O<+43dOK+BYBK^c3WYN{Iv~0AP0{ z_2?=b-%@ob{{UoIg3j@VEJj#>K*y&QIMc1d<|GQvP5|R8+N?Tmbl#;yV`yX4`}V2j zPcLgm#X%cn*nogNSJ&ys)|aSQqaCb6_z-T$`G-oFOAWuy2_H{>w3jjn?Iqu5v0}uV z#HewOIc#SK)A`gA>l$nkNir4?H_EFf(JxMcP78DF1vtWTc7nJlyOT3=&A)E`R29G~ zY0o0tC~2N0%8dD}tDjD$uv^7bGf8U=u!U7NnI8=?PPhQ@gWs>cXK9uMN=^Mwc~LZ^X4totn$OQapk4bjQq3xE-lt*>XmOG2KHvMt@>By+@VF(P|oyUxNgIo4q7`Zc}T&1QsFCUsnKD>d0{{Ysj z&*57nxX+n;un@r=!ug)Z11dg;Jvpsb8lzeoEIiuO;_e#S+j2bd06EK&1APri9MB++ zXINw^cVb8R8F7L>UQb-taSn-PWU>U3>Aq}+mGigIax?xlI@wy;+nL#5jxMb{ikxTt ztX7K&EACRl_h$w^%WE1*D<3Ik1*4Wts0V&Y&p%q>?*e_Qac6ZM#jtlV%BX&H&j*k> z{-D>bc$Zo5>^dd=jnt~ia|1@+R7;f%IBXtp4hZ7BUfaox%>})|xsN$n8ZD)UK4XlW z40DdyuDDR-g@ctv**lQ9{llnbI3xl`<5q2k&UYV~Hvx`4DjSWhJE(qJ1KSwn)8W{x zgVday@@b8ohl4Ht)#AH*Naip!*0SP2s!sqM0Ni%|b?H#}cGd^pWSUut<;hjg{qxrz z`L8YTtZEqumz zdzLP80FbBzcLaZ0oN0DpgFG@w`OajSo1an-Q`)Zv$0!c&rAJU`M=91WNa%VW>r|;j zv8OCoL8c_gD-w<|!j1>lq*%`2q&V6}IpgyFl*2J2mFjyV06v+Z%vT_i2tC^e=to++ z6j_JLje|6DHr`Lo!9B7uijkVyLH?}nVg2LB&}59iqZfNn!fyCesow? zEhm=BGG@rnC5QDG{3!}H)+r(EM>wbeW4JqjanV2^kMrK0i8#O{0ggLUu*n)Z!2qN{ zkDD!mN8>>*4*>lD$o~L8g)vUt77LNlx#RH^{{S%KD0dE{2dy!mH7|#ZfQ5+Q@<8C9 z&a@AO^yRjJqe+^4Wyv^Rorm?V(ISc{*ko>D%P1%GrfiNdQSpK~2)SZDpVqvWQb(Ps z%2;?(+Gb~-f3u)pllMi*Z1g;g4}NPzcy<$XqS-`o?p>Z^1|^3%&rY?m9@*tl8cQP{ z0)@`w?Mgp(TPKh*2YmWc=WkHC#Oh5a_La;EPs+tk;z=jHJ4Mz$!zhjIqU2+FW61vi zKJ{}@wK4djV%_*XZr_?G)z)LQ1|Ozk8`kKqLW0QIREIYR4G)LPf< z`xT3cf7T6&$m8#4`qH(GD-n`O_B!K{_e%HYrFmptBh_ud@^u@8J9eB%dUqd6WyYy> ze5xn9mQFAOF(40W=lY(znfo)l)Vxij*)x5eq_~@58Bdi#_dms1v3SbKl&qR_g;H1u zrtW^D`qvLAU`|QJBq(=o&ls&%9+j-s%Jy-w=xM2H+S)~IRrI|(?;IWT$T8bI437Be z*w(I}CYNjtm9$F`kLTYeOC0wt)RWZHJSkyuH-{}|lG<5hWiB6XaJ>iNj+M5WJ4r8> zM{7&9Y{^L6tow!mqMz~8yq7BmpL17M;@K?Pm));m@G={W>FhN z&A)L2-x=wjQCdIPvXt`YdwC+r#>nvAhbE*=3e?B5&bh}w>gGF#uEeUXtJ4t?I)FGZ<%}h zspp=ZYTQ~yyc4wXE3=f^=Xq6NePbUlNLD=B$qO=u3|K2V<0FCSKEGP%t@PVFe=Ev?6=Gu15(zP9uF`oQL5g+V z)}N!knaO*Nw~X{!m9Tstzi4Il?821d35`^6@o({jDTO95_$gs>mSb)v1rq8 zL`0BnCut`H0(jk?`Sz_?yh)=Su{HtgI=&yQZPp6#_hfF z{9QN%n)iz>GTP3<*2*}d5Zl19#=%C`Om)cXp1JqUeBWy$-hGcxmhKB!mocoXY4bLE zEBRw|gLkN6J9<~Hd^hm~L&MQ%T8yIKQL~xhiWX;L>5_REJ2sq*9Ao9+W}dHc({^Wh zgWyv*5RM#}^9^1~myb1uz zA;(`s+wtq|LPpL>Zfhe&t zN8?qZ+=nLq5`9nNWwci*X{T?J<=7||iS~&|Cz3PrlZ>1I-&)d+4QBe~Ue8GoxNnzG z@eI4A81Auu57P@3RQ=Pr=PB}bRpLmijGe_6wxYLWrGu<@N zL2Lv|1Sf!}{iX$h7~m1^E3@%`jNrDllHS7Y9!N;scYY#ILZFTS?}q8L*O$v_b#-?= z#o|j0&GV{Cn@ch0EIWG9q;-3l&sS)SkRx76mMt!FsWMJc( zB9Sh%`2O8;k+eT59r@b1<}wE$=Olektw|abV+tgVD;G>E5=j7(a0YlB{W++T)K+%6 zt!r`iD~lV=#TMS~35m`y1AL2H?=`Y;p+34SQdQ zuO+_lwv%@&sCKfCelk^v8&5xlG-*ps4TC1&7ns34LEWF$fWcRZ0~YKEBR}KqQ2frJ zvByk>Cmz|MnZVkF=Q!kWPpAT!waNDeA(uSy^104w7EBc>jE)!%N&3_VR8=9OEC+5+ zG}dTI&Polr`A$Ip06$t`a?y1M?{kiO5I-tUFz?zVZ*Z~<8mbOhaJcLkZDaX=jWwJW z-0a}vf}{!p<;f;}+xC;}IuFW)o8>qx$G^-5DuPZV!vftp5y+=M%y7z69hj0R3ztjz zWbY~R{J9JbA(lpV+~kaW=WcQ;fMCiW2OWq5wJXUYDdaXeA&$}qw-l}yI9dM76`z8x`ZeS?yv-V*%1#?25lI&%az$(35j;@Y>s4+}z0C zV$CJOM6u(p?ZXZD9Mq^M^2Tk;MaYbn+En!+Tk+1*?@`>3Ihh=zqYM>LFr0EfI%+QW zdXJTux}Ci3b{Kgkc=8<;#!1gnoMcx$t1Py+wt%4+kM8=MoYzCF>J~b8lQh;)PT^b0 z^HXsQKK9eo(~@!y7LLbfFOCYF 
z5uTZ>X>PAHvgQ{@laMD7k1!v0#&hpomASRM(`~LtmQ2wrmIIP;^K|#kX3u0U=h+3Z zeZ6teQm0OI^tjX-GorsR_)_u0W7nq}w*Kw%UAwqLIRYPIeO5{JYk=+flgK=fdY0tt}%()U~Uhv+0&{ z+{nDSQG|?oyC0QD)YqZ-U%)yo&XZ|MhA^*UK5{2jYg{t!3V6i*m#ru5%SL3u6kMj7H4Hq+`&G_v5Wz5XGFjBNf0Uo73rBPlI*cM$=YL2Wqxio&;rr zK*Mxuk(Ubi;n1l0TpvJkI(K7uz}|#pa7f7fJJoNgad$&R12|Hk`;Na_oI|!f_yaA} z4mhT7Mo<+tl^l?A5BPm4@q}g=P|wN90OWg7VT5tv^70th2b^agg(z6m1HJhjah!X7 zC_Y;jb!3SSINi7t>Q7PY-lN9jkiatY#DENR{{T3oGGWf#xK+o^x;&r57$1#sz9O{o z9<$;Br&-vz(t>%C{i6cj_^1V93^@dmfOD0}AdZ#N+}=lN7&8*=*$EU_CQNm~&Hx?B z?*m1dSE=(}0_g9k}Br?|UAfntH+*5K~$P6-ahy%a6(F5Ud! zcV~BIKke?xBwuoJCf8)nll!^l`VC#{?wbh-3G`QY)sPH^b-$nm+cKlDv_+Q)ip>o+ zTwTN-P&*u5NVI*|hxoqia*2b;F^n4pH=Dt$B49*be^ay*3;uRn*6)$8E0GJauEm!- zBHkkNeOXg|QZ4sYbf>dlHI6w|zs`Nk@9Y74wox#s*^s1~?UE)QYVTiW#MDy1$fPIS z{}om!Gm`V{OH7AQe6KK7SPE%O7px;WO&n+gAIqmi(VUOtiCxk}EH!S=-Jwz6u{&;E zsPvE)#Ei?!8P$qiK4UWpa;U8T&>iag_?!RxyvCBF^VwOATqek9y)lax9U=NZu~6Fh)+0p2a+G>{^&w@% z)yTC^2Ju8k4ClALc(ikkQ_Z&0iJfX5b3&a5G3){6AIKPfxS3P&Fr^P2#0yRLe#sZ4cyHKPq>Z#RZ`2{Z z1Uq2e4nW7|wMf+TdG${ZQE(f~{|t<#)BoMwz#Nxz^vBXh68xH^5?KDLtGPZ$tu@R^t8D@XBZNJD*;57lIr1N_;J zE?)L?nHj?S#&g4nnq7%p;uWZUXUOUmL@aefn!0C{XSDtYt7@YxIX*bLy0EwQ#YhQr))`c-;if@rGUA zoC+oyc0X=sCHXNLfX7y5D_=w4z(-It1MM4&ijp;@qdVj=f}K--sZhcl;q_8Z+28Vc z*_JkdEH}zEmNjtw`5z#FqC{)askFBd)6keLH+DT)q61Qyc8=!IQ_1v{<$Ehw!Z$>; z{U$uGz!s6ZLw}jD{|bxp!}-8B-!WjtIlxcNWRHeIf&G?Q=)mw120 zG+2xK&vAhmn=SumVK!k=4PV~S#e|8A&oFK2^y=JX1YKYFP=j^7FVFb@?-4xc1>4fYtI+vWuW}{u~=t9)Q=~DXD=qRxDZx_u|)V>;#yYSUf zgenPqBs4ERg5Vm&LX8p)G_IGmqBRRG-tCnQ5&F@>CbdsakhK?0ar_XN?;j@fOP2@! zYPcvsB=K!DLy+v+A34UEzB5~Z>f;SLjr0xH!p;0u^;{zJkI9#$s}^^R2Jtt0-$vB( zM#gRDQ8=iZPKW_Jjnq2{i?Pnuk-^6nDjTOs5MN-se#G`}#Eh!NR`Xr^ht=GK;_XPI ztL4ke^n35C(JE3pO>-gvkvlTc<6AY6*y+|0j&2aScNAGuN@9V)vl@Qu&X_x`db5hU z!k{qR{tMHHn|vkNhWvI1N!HAy#EgmH8}rO6Zh6vhk*i1rgl0BLHmGa=x z5>o#+?%S&5891T3^cP$!*mlYnyrV~5xGNm3*y9~Cq6_j55=~vHw()HbR{g9!-0cXY z?(GL*3ag|CH9Nv@HV0Y>qfie)i`Sg~qxQVce2-;-t;o`HR3PCeSfkPH`T>2BLWOgY zVi87~Fe<1cf=Qb0;1-2a+%Da7VpBP~Jhl7*(UZ2mdv&hWb1q!$9gGJPGaRJ=9pHdZ za{=bjmb`hrBiAxWKf3&Hr&B`cB3nLC-l>rnZLUkf&6;vmfT}}Sc2)4GvACn7BVwqt zq-1aK;CV@_G5O07cTp!{Y5)5(ro#C^SL}*XyJ}Xiz$P7hv$c0J272$Nid>Idn+KY> zaumInrL~B`m%U^$)bxRGttsT|8DSTllD9HEzZM?d&-J=R9q~0lgrlqNsAP~NSK<2y zh{52MSVF$spO>mEnkJ}CzUc969ur;gKne*VIE#|c6%JO$?&yNnK;}o)7aE!UaLsb# zjg;H_T9@JH9pmwA?S%TiWGUS-t&Mq9Gib5C0a)uFG((z(atV`ZjwPDZLe4*+F3bR+vo z9BO}gb_vH-ouK*8F?sPT<1$6YiXg0Gd;yBRUrmta+5c&I0~{Ho{Y?GV?TZb?`4C~g zEki7mB+2u4!x^CTV)>e|OW55Lq}c@YmMJyPsA^|pt$f@)=Y6{yXd(B(3R7Hq8| z-i~h-Fo(~JnV7xP*K=R4PW@k&6aSXHe%bC(9pr~r(bLmQUleI_^Zx<#3FhHJ?+^L4 zMg8xT#C_ii73k-JTCRM`Us)^+DEh#wD~8TEOiJLds{| zyg-@m@R`$WPl62P`tUP5+MJ!xTN3Vcm45(s#b=l>LyZbXM_YsJ^y=w(8?A;or8x$D zUNVK5wTQjO1_q>6!ms_CeLx?4_(Qn)W+$%Sl5>UiVc(II%Jk^O8uQ!WkPjUK}rDD$R#HDFSkJpGfMefo6gbbgE*i`a`V9g&9WYm3`!^zLm% zb#rHqLIBT!flr@b@)#k_+ma1gn{mKFg~#Gk_47(c~xsWX)y zd1bYy&wF20bbxGNpw=6r+oUvcVxto^eau!#>T*pB2e=v|Yg#5emic<*p}?P>+#$Dt zcPT__7B1R9dUueUG(UT~)6tr|Jl*lFN`756-P=`8wacdvGH-He#y1v}1c zH|?IBoz9N0-39FqEFYsrgntJ7bqZ%!QYBU2w(TxQ1Uyi>`UxNkPXFRnV6|}|Dkj%z zU)u$NCq=8u^wTj|0w?D@&EkN;J&;xwsQbpM6TL0|0j_v&1Y*Ls?XaVW^;zIzH*H?$ zbifZBWerDeYjk&2)e$foTe0<5(64%*%D(X+!hl6l&fjYD`10e;RoA`#wS70H zxVMTqr!GBMKF|@cup@gfC64Z^e7#g7X~lQ~gfvO?$Ond^t{kQVx+4>Yw?RT6c+@V#bzQt)+a%qpb~hQ0*W>NOj02^ z7?VK@=a9c6Z!TVM#Vo2=A0UINP`Oa2_h*(H!Z+zYSC1zjDpPe?Ds=EN2Qh7Kz~~eK zWd#yoocc!Ar7hd~=CTW!NK%^Ip28y{#Wta>fwwdArAuSYKT3e^82(|F@VQh*i3Y0y zG}nGCEcgO9L=sX#F+4)WjMcRu(p2jljJE5UkOJIOmMmFnh_D2_|6Mt4q=0Zlkc$1w zV2OX8Bl)c#Qa)z8SpFQ*kE9|(=)bM2*{?`X<84e@_;Z@ppf{wd0XQosQVQvx9P7F9 z$-&LY!)O_68dM*V@v>AT1xA9JHvH%0;F)?%f2^1{Cv%^R&(s(b(w?uM0i&-`L 
zdo%M{H5`0=`_mfZ=4q0czk06HnD=t8f?~HxZ>y*-Hvsa-s)~Row+?i_e$lV>r*eBw z%CMM3D&FKs;_7R;`~1R8$mksC0-`sHFVn7Ba8=~L4P+B1l~bj34{inW;nm7Sz&&y< z?=95he*p?sNEqYNc^PZzk`4ExsJ3wBcchv=m=lq~3QrA5zPCkY6htwSNGnFxectjE z2AP$kgZ#c}1PAf;G=jSk@lc9#Bq7#;0#~})$qigQ82Meaqg0Ml=jr=aXrz0vsT*j3 zyddmClAjgcccsJlAArdUNOyypc}##<*Wz2+<-zt3@V)Nb**u-VF1AszOJCb}&-?Dx zVYNGDzxaIV(L%v#hzPC}=HzTS!bD-m+xCjRcv4cFzc*N2dUm>K_M)5He$6)e%qi~7 zF5?Wlbc-k_S^nZE&pbl6L}mz26GJ4tpXy<3_Z=Ff_EuDa=(Aib7dgzqrX$ck)hJEIjcE#I7)l&)u60PAn^9j}4UQ&wZHZ{^bMLnyPxu1eJp_3! zRq13|@in-UCC*rZc9|y_N?2FK7t#5n(R<$hN1%e2o@Ibi4sG7lhDE0)i51g&IE;>n z|I4;Fn>z!`*RRL^X9op-4u8(@aSprf8u`))rs(`ravg60`&~Lqpc2V^mV`u)5dPOd zV@{YxA!ewvmz?#xZT8I!`mwG-00gErR?~CKQgp#r&M~l-tRChp_W|NzJa|HUL@#%_ z(T2T;c@=z1F8ERlxjM-0LjF|6N8d&;C&OAEIN^@Ul?=y-2z54}aq-Z5=WTegNyMA( zZ`A8EDI0gk%Mc4aGr#4Px*5)h#XTmb8x~gCzScJ)D+31~UW<=c82NT7_B3TrIiFHH zd?)9V1L`%UiP*mSmaFSJd6AY)k#Hw9_JyGP_9-vkv^bTaFpaoJ?>h@$I@C1HPlK*t zoiyb;=2a2`*Xb0ThZR3(Im+dA!^Kw+9V1!-ZBI?t9xm79oNfGCx1(8dU+(W_8tP=6 z42-enhK4xTQAVd%?+|0T_M=c3H3w0>j+SffcWTY2EQ}2`f`k?G)WJ*|TX^Usu z>r`g==D%UuK=UFR>&oe)r#vTycBNiE_;Ia7nPJhABXH>cut)>A+4E9Q^Y#bvDeaEV z5D{$7Jn45{9m76GZh5^laXxkn`put}DO%LUI%AD;Z^gc;2IouM?w zYG&264+^VaKgGIGb}l5n8F9LZ~rkf0c@- z`)Vj~)T)+ybMm-|7G$KqN}B=m<8T9$b418hD7K-r{{f#seFU%gDU*ed-^@1Azg_XN zYhfnm^J>5g^Uj<)&(g`h96G)qSGh{aVPTY%IL=m1=}X6dV($hhXb{{Tt^4S!@(CrB?y z1J>aKyGFPp;uF0@F^q$#{VoT62~=C@B1C(OXjvG|WJNOUu1DTtfWtw@-ty_53cd7`t#hU48XUz9E~fT1v<>|w`u9{R zZw5@$s>NBV;HG$~x~@+L{;|ztuyyp50+&?1u;hr_Qp;gT zi>XVpj3wnZXnOvfg+NZ%;b~H#&qe)lsl(9f_kryG&44)j&~0UH&q*4skbFs#R+KG* zjv1cTUcD&i#a-EO4C&lcM+Tr`lyNkuOY0L|HK?jFzRt_U&fw_d?*jr+;fOn4;IIN+ z{WfePa}p3i0$S+sGTTF-ncpX4M(mIu=YZesAo@u&i5G^<2i_)cVOquZPpu3-lFqd~ zpx9NB--8yfS_~r{F)?!6#Vc)zKO$K)+yhyD_Z?QnGKRHpp$)!TYvDLi3xx>Yf z8jjR34ma)xB00Jp0gI0A3iPm&VOW;I(AXW^Vr12}1#g)!A(@uuM`QNRz7s_qSRp}# zzUA5v19}h0{xVAQhkg_M(L7u%^Swn65vG-cZGk|L{PuvIhzOn32a$+u;539-x`S^g zzcO8rG8FRq#zdr^W)t9<#2U#5&VPBJNaNsVcnFPm&A*#*^q)s|t)+F`P2_Nub0oHC z-<~$L81}b~BE@h7f0b8{*-L427mDYAY4e-boUcT zs2=NrfOlTjhDvT`o;lDB77{vu%wr^E#y)x~_G^FGCgC=!E7HB_chLkTb9cRTIg1Rn zk|ut~lw4>a5yWlcw-cZNrp+??a5Kw6X&FUW`)t<8@-%Nn%2i=x^-cJe?~Dj?!#i-Yrj_XCRdUo?=fRUNUqB{1$>%mUEqBekpu~RScF6NXx{7K*hm1&7 zEoAtfH?sODTjix{{WkorI6;3hp#|j6MB=I)#R-1#8!hovKE2}#6fdaw@u$0^D~dt0 zmC-hty5?0Pey2tJ56;|fvUWg^VeKvT4LrM0(C?g+_~%93*0|5@;Lo7q^e=^cwyaX% z{KEo@@*1A*8>ija{X;~G6OrzU2lB1<=Ut+T`@Zp2K>`z3i8D5wLF6lgLoXx-=!plA)y@YK>s)D^YMDg`csu-*u zF$lZK7~)I=4|qRUXhEA}?ql%@`Q&I`&tB&_KHoZ%(ZsH_id5auUzCc)hq%DeZ?!^j z9jb_h+<_Z~0U1<_AL`|_mWzo~RIZ`j=w=-vx_oO~Pz zn~4pE*^83bC}~owP?KDdtlz0Z`^ja6w)z?~ zf53&hqi+k+Zs(dGe{m_^R6zLQhfriSE;ef(Jsv*>9xrCWkd>%LaE9Be6-$Y8n z(vr5%ZV%_$>Bt-*qs2B8q|2O*A_}y!qCdkE#yyUf%I)TxH6{zjIm{W&X zp~HDR8xna?u|t_c6Y;tBOL;(r^I=~p(uquW@)WCU;ZMBsq2>ls?y%L4mo4Z-rJ^qE ziT$9mI<4)v!1S^-D9L}%%A(Z(FQ5i_PR^s+v~&jqA>T$aO{as)e@`_wuHY=G;FN7Q zM%?`o0T&~8CO%r@`E#UC z%*o$rZ(srFBQ#*~8T;9&uQn*6RfH5tvm+rMj2G9QGv$=eKp`u;X^?SIgIBDc-!wWQ zImY}_O}%tGgy}|(*6sQkGhhj)=a$~CCT7b=OJgaiSs!@{#c|+le*cZRWdXE1Ef>@< z;!Z)z`9MlBmm`9$%A7;egm(!|vrwcBrD1BnKCRFSTy?_t8I3FYE39yk7)E}FZQiLz4;DsDWl4pw7%j& z%Wj}2^^GBOv7O@0K53ttmSath5MruW3vb<6)siTEqL|qJ6o5FjfHgqAlM}d>;xSLG z_ke0< zzFusnh-BW9n@L5|C`kuA$yaXN?tChMM?7^yk;-(-EFtia&#ur(-52!y`cq=&&syX) z>LZt6oWPpA)X+&dw|Y!b+KPaeY1d0Ha$bnNtgbe;8QA;KS@QXpYlnFH5eQv}izAPN zHh@%vxjXFs^R)m;4wQ~-B3_~$P)Ds{<68Fh<*?0J_*w12jfFAlnkCh0i&$j$uSa;< zWr^EHI}UQr8b`uHop2DB&_94sIu4Np9g;`XGs6Wi*M%pG=DKDV7D z=CNYy!Z5qwQL>Ohx3QMingP@b9X|U0+9JI0QZF#`qIcMiryq_F|5U3T!I}0Xa5BGq zQ+@E`TjmpA^3hlllW=!wr;NTUS1ZCt6zxIQ426fk~+>3K5o$K2!y8P@U0;Y{6g8VVv^*N~2q 
zY;TNs%qx*${blU;od)>Xi+Lb3)OTOu45Iq@u{uYuPKC68hSc_}(*P;?=EynI1c0E4 zre0^gU3O9{*Nf=xzT(jPgSr=Y%~@bokrln~xin&PQj*9Z#wKEZ%B-hIqVmyDNUx4x zt~8^R31!&cc|2t;xQq{Ow6%sl9-$Y=Xry>IH5z z)-fgp=PWQvX1g6qdka2p^RBXiShvm>1|t`8#u zZ=Ut)Y}cXYhjR|00_95WkLLHR0v>Cf&d=U2*4YmC$bY_}`#xo?fbBob?*mTLR-4}I zsq^;ipg{B7h4>=T!9m(|UW=>qZL4iBLXm^wwuM@{c)~Ja3<{<-F5yIJT~p=sJ)tos zS%|g~%+rFV!O{IXc~I8^+E3LjGtMqJXc!c9Y^i8v&&ThVud7QJJ$@38QGS-_3@Iby zy6~CeT3hc$p~z}_2s&PzF1029((M~3&UpmdDrByQ#WM`v{v2_Ep%NrorJx|KQs|<- zbSd<6pE;gke7IFotc8(ne}8@=0}ga7YLA~T>$4J?ZoSl@gVR>pcx5aTF zE6%q#GbxNID(9y1ut7ckz+d1c%4Y|A>T~TjstN8>+yI%5PjV&`4f2a(h^jj0QVdo| z;Z3i1F=X0ie$GLHhG~gCg0=9d1H<`dmB|P@Su;~W49O?$4wKP2E?xR`{X$k61xtAptlO;3Puv#aUgdxU)n1(5Bs=UXrnFQZ1nZ$hF~(g zk=yCun)a)WdYb@M-?wK9myi>KDhC>)0UsDP7aWA-Ee@qi4*B`#|us6 zv~l%v^9vz){&iOCvtS4N>0koR3Ap&z4(P^N2+Oc_mUZ4{i_-15k6x5;sqkyIcDIdE zjhu2q$MtX%5saf-Zpg>G!y`S}3(bWE8B&WSR?Dybfad_Hm4P6xR|?bg3y!_eH$LWH zhULj;??$Ea>Tin|u_gE*R@4*|1byqLq^NDI6>!#ljW8PfysDR7>K9i(oOA)Qb~4VB z*zS2*w&?@98|at_w1ehu$&0z{d{{?mNAQUa2DB$f(h!d4=Pr!%g_gQ9R4Qzgro~ao z4jxnu!Ymn)H#vMJu3Q2^T?uf#NzX}f&c$oo$Jh@%lsXjYO*u8E(1+c?yA3mU6{wR& zg^8VFU)RgU73-;!c^u)5Pg-S0)IhT@#CAXd8aZbAwH4S|4RXLm3^dZC(D{mP2 z^Xkn#AfYS%;htASr|G*@W*Dnm!nAsnmOp42cdm7C<(8>? z6i6mD(f)Rhc%+^R#J8dsL_v*;98*($x-H+8syv5+d>&!|sff=3i%>Hv{-l!ox2uS! zWshtFa9p;`J;)k`xLR?*?K&CdE$DATG8g=T>Zc@Ee#=*uk9X2c$z0#KLUqN!g|!B% z_mMU_8c!pHx#&W9_KEi3;mXRNV>ZhBSv1k-heS^v@w?AjNY!*y*6BQm2bBBzav~AS z3&f*6-5wsKUopdxKydEF70nm$D#-@Z*QDXARGfe1oO^@s9{uo@(kF zV2WU({7HlSUl#beA-wE7J1N*Y6h+~plH|N6YxCHh6@ z=P3n(c2e7<;K0LsT1v%*exu(ys|;(G%~Mwzq4|?`&|Co;>UEhQx&X7PsiP(_kA60qxnO04XqPg2IJgFbS+mu#pQkJ*@9qte=>>hLEMXc)u@7R_NT1G7lwjo%Ojn> z<~q98=GOF8k;pV3N7LML5zveg&isO#dZQ$9e51;~7Q~N)6mH%9J^D$r7`*7@v<|TaYbFCX%8*EKhDcC$qr#+@O%a1^0?6`B? z8eZ#>=Il?d5;B(G%stdzSXJtSNth(GTSqf{ccSmW#Hzv{&=vIZ%4WbusXYtw>>s9H1s7)P19CrU0@=I+u4F&&OIhi}m}cVzqFrG9Q7 zVmbo$+miW?RbsF;rvt4zIAlut%wYdLKgZj!;sU3EI4gy}zlNNg2_+UF=~1ddWS zp@~q~Co7cNKH2)IZp}y`Zi+3I;ANzI={!X}T=`7WMhd2wgF9-UfYHk?Hma2*Q z*76fTHU-W_G2lIUW-v{9QaRW87&@{7XFx#9!}xn>4ZMU!QB}={Kxv*W;VSvSR^GiX z95zbq+fG?;OZfD;w2Ozkbi*ei%GA-nWwYcVq&kXxh3weoUd~66oA73Pz{i;o`}r(y zIpQ%^8J~-oG;}im#DZg_vqZ(#eY|}8$Zo@~Di%WrzwMY`DEyf#TXJS0C00MnjCiWq zP2G3l%PeEN7`C?5tPL-))@o|?D$;0NV{zLpU~U=6F#2kg?WdPI)cx^no35>5C|j}n ze<8R7&`|s%9zFN4E~Xe(544yE+u@#;=KB*%t*5T?+2-FxK`!V0sPj zTbjp^8kR0}Z%RXfU3wdxn@n_RSHvEsPX}3ngW=*?p(4RF&$H?8_E+u(^rn3j+Jb+B z#Ov(6j>phuJ<)49Z*KcXBoVT}JxGOCUaW-76`K(8G%-0ty&-2o8WJB;%W|4_o}n1H zCpw=jxO%umH{kEL5NWA?Ddp+ihg`srfXk80Qhy6xF??ZM#)_-#YqDs}d$W@^s+GGY z8Pc@yLh?A+Oe;aT9C0Sz8Qzl`DL$kA;Lcy!Jy^(v3FuZ*FVrYvXnB zN)!_}Tvk!0H(TEeTEB7L=t~DqpYx->kQe6C;{Ty=y;h*3n3xeA5zjoCGWZ9OO@bk% z)4E1il7mjf<+EnVAEmR=Ojl3NVNOMj+Al;ct=9Yka}w@4gu6X1emxu7;iVQ-JNpo7 zCV6x!qlD`&Ln0m4{b^$W(m}qp+4ENy*YNMilpQh+D_7HT`Nhy3mM;mx+iETL>}IYG%_U zch)8e?iW+rZDks?M)fYx7)tJq0+Lq5m%2T;4dDL%l)3jkoHNQKjZfLcW_MF)jQ-~# zEhabUEcV;hMoZPt!^{yb4!3OU;`jmM?-p^OcH6ZQB+#lfhnf+$`;Z{{H-F7I_by}f z7~#K?j&n&d&eHNz)BaG#BCq^*_~AY`p;~g*AH*89&aC{bc!79K6@vVW#1bpR6uXMi z%j8}&=80UWgj{>ZSdFfRe0B|;4_Zh-LO`(kt+>fz@UnWcA2G$J8DWVHUq(dfJd;H= zc_sN`_ALPj9+zy>fD?+fr&bR9@* zh{@(pEBfs&N#(QkZq(iOS7s;s9eE27Za91>DlqHfuCiktUM5&WssYl6Q0d>rad2#$ z{wr1@hD+`^yq>YI z@p|d{p5$J9lXrMmMU8o{z}>`Yrh-9fw!k;{p@Am9FNbf(%#@^P8rn3pQ*CNsGTbo5 zgH`Q_e84M+SZ}#5)}pJNw%=~e04t^oTl&7#)w@7AL$94wA;BTAM^SfMgWYUs_JRWc zK`E$#t)|e3$;Xie?szEFU5+6!M4zTLoQWDX1j?c`uZPia@8j6set2b?lO}urLDw?= z1ZuE4mV_+ms1!MO2HoWt;K$)#*AkTCDWQwyYu$)u&UMnKXwPlO%F{>f|0sSsOY=s@ zD8GR0PRTh?{6H;J$aV5~Ahi_`9D*5f`;FUV%A@Ek@||2FcDwVCB$U?cDBC$pXv^5d zn75mA2A)ti#@m)bWc&F>ldww9_+w#bcRiSd7ki^RI?$3t>yxZ3`6{B=duobZn-1%Y 
zYsI~v?$k$;r_xV0(-^y7T9)cc>eJTLXv3|k7A5{;Beq_^%jz?4Y$D%i`E4BpA)>Bb ze$cx!NbKl~WOF)8rc?)vXy)H(YM0n#I$24@J`TC7AQF~zHpfuKM4j$}q1A3GP`=js zjQG!)rxTg;LB>X>UA=UXYBP_0a8aVFVwCN=V|Qq*{D5eW{b&O5EIvsNe7Ju+shfJ= zh56GHJSq^v%Vo}nd_P~gIYgbbj_zImWaOhE7FEN^sSJ!j?I7`i>j>$TJb{qjAyn`d z+XavFn_(jIjB&8l*fJCS1FY@2x+S_z@r1FLR@-12F}rAO&=65FiR>?rYok`1;EzzIkM?w52W zTZPjaZNp!j#J~HN{tiFuiDiD$(@>y_cGE6Qny?4lC->d!E>C{hh37FM#t#^mhsvwKJH)LuGzIq5YCvF)n>L|{9a!WV< zX>OVdS^@q8OlMiU;HDpVn{F`xfIH39?gfK?JirpSRS{qC56}ND3P4z? zT*q9o~&T@WqFi=b8vvIGu$koWr2~9T(V|4{2}7< zi2^6*YIV7H6Diip^OfdQ?KeTTn4LuU>X{oBXH{0SLo#m3??l+X8&$WuGwN?kvnLJi+X8}K%gO&cSXWlKGgyr_4Z8itX! zhhrN&hqrXwS9k8P*Wl#!o_m4lOqhlheeP*i|Rgq@p*>pvHP`S|f8Dhetg8X6%N6)6?h z|MT&$AAp4fvke;x2SW{j#e#vug84TLpn!hQ4>13A0RHCz0}BWL0Ra)|BQgr~gceKy zEDRhREIizY5Ag8NvxA`T1K_bfU{i5QBH*Z-BT~EJa)p5kkZ7bD2Jkd~UD9$}xI-p-xBn2M&>j;|Wtvmb_i$-B{ounl1 z1?pU9sl8k0!X>FIsjZyEKTSNVFVB|O5G9IyblUmK7IoGzDXUs4Q1z_Iu+rQuW_s#~ zF)NN|sdjnam*V&BO{iP+&maW|zH^o?z z-uxuNeQu=vZ32}Nu?lJnt61yQe|Zs8L&d&#RBfdpperiz@Y4s|*M#J|vJXzW4r!!O zaEk?G$loinIQ`ei8ELu3my6`>yD32}qgLttrNb#Od(8RmCbx!TMu2wA*!%0}0*dtc z$7TE>gx>hatUqXC%Y#hxIksfouFCpL9De8=3a^oYH#{N{e!8C^dsDIDg2Wl>~8ygZ`by1wDBPDt=n-3b4Uu)5y?NmZ~8>K=q<)R z9g4ZMXt!Mdyz#`0Qq_Icm58d8_v3#6AXA8U-yr1e0CHP6;4UWmcNe;Uu>W!_M2#^d zJ`DzYxW+=MdU%lQ(n10nq^UWs5ICiMfn(etc|3PEuJ^xTWX-jz&Mi&0+n=zv;s=V2 zIYfuq-f^EO%#98P{YQ(S(Ca1KE`sj(;5g`VUi-ZM2Qki>GY27_{Cu?ib;@z=6=i7y zquJO%`g9)WHT9QOo5KZpS?V{*)tMc{+cH4j21Oi>$+FWWIfvX@OQu}y=xSdUUY*OO z($G~%-9l-~+pzY#Uy1JnnrdFu?{mnLO>S4>CdI=z#${az<)i8!Bt+^1)rS(uHEeUi zB}Rl9A+dqrogb~si4s`MyyE(A?4M|?$q-mZzBa^neg0RZFjL*1fAUZacWvS(J%sl2?SFjZw|MBx% ze5^bigPlpua>M|n$3Hl0N0`jwr}L@dWK8>Ec-bs)q+@@I%+HfoBktIj@lkL`=kRn@ zVArGk@TRP8pbxj6s73pppY!T@c{aqE;%4Tq=jg?q?j?1On=-*EILPeS=fU;k}@0(3qb!>40M;gVnX!pBPdtm_}( z3Xkg*b>U?Tx|RFAit#>SGQ3tv(O&cpZ`Ze`afekvaY1Z!a~iRE?uYb96tZ;>d4EZQ z-p#I$L;+jx6E`WZEX$f&?yj?8kJ&9BCcaEhV$|9Rr5F~J;QQ{pqIZzF)QIyJ^dcI2 z_cp&OLD|Ov#xgu7$OMktqyXXUJO}Cy^_{6pp_i7kjJG?6M=jg!sA53^?~re`;w1r} z`6swBv7&_bVjFtIyZe5SFDbe1%f(@La<9s9u{CGFPkn{u??cFL1Id zbJy?>kY4q;rxkvHaKs$)TdU)@S)&M-OC*>QK8*Lo@~vDiL~*?H(RPPNM8yYh7;YZ!)!HZAs`rP2PI-Occz4x@7Io zi$(bpmUb2=#TzXN8wLAcefSmy;23D*w{xMbslOWMPuZOJbESjuljhPB6tjZO!|@si zLd4v{42B>|WGl&^Frdr`qG7JdYI^1#6g+{M`7n)(SrR4p{kv4x>3g@2l^-tbX$N9VeE1pTK5_eFwc+nL;`pTzG;Y<_d8KVc)h&UUQVEKJT8d#b z&gHtm>k-RO(|UU;VGDZ)6)sR4kr$lD=%4=3Ia7v?1^QvTVKoM<;oeaH7qIG)elW#1 zS|9)BqHk{-rWaUa6f$r6_Gx1-P2W?oZ+Zb_@)U2uES74L=6J2?Ww5xD(Tg)|&sVqV zT6BdS@EP2=`;GHARH9E)YJq8%Cv1jv+{j5jVUQLHNe;O6Dcbo4)F?B>QCIoRv67fU zCuUcFV#j1o4KCN2(b6tVBVf0O)a>WR$>3#@P#7)C{=0HaEFNn=!ChX~O65?B zKM^8YZn=$cN7<_Cx57&NjX}Hy9<*GP)QmBtd*;x#(yiesYzXbEh?wB@0 zR^fN;)2^N2-L|d;WkRaxkag!ZbSIR}lCh>B;rj8y_~D%0XKwI>!J?4>yK zTw8Bjhnhtag@s@Fg=Ney^RJ6*(xJsV@U3~4S>2&4G%UXfU+tG!t}UF3s!M9UC-hU% z8bgl-W)#=LkJt$~-zo2K1$cOACvY`T=N3}cmw0X$J>$BF#7?hLpUHpM1QzU|aT%g1 zeznr5t(Jy67Fu3T+Y7;aC7(akYfcpITPi1e3z_M0ky3oEtH@o}?9bv$`Ld>84?}wo z$LoDeofs}yxb}QMMyI5dWv(wvF+qv*xXasIWKOS6^AKWShK}Ha)^VhR>Zk?ycwhQ` zg>f9LKm%;~5$%5d;|9WPNpiLxaq7UQnNXAKwtiypOlo>`-_WK|?3*aQWda=-K{gNp z%~QVl&qfIes*ezc<(Wi9#|SUTk?^IN6d#giOKllRizIBI-tHttH8Ybi&CivztI=+e zFzvXChdDYJmq)|>6SXV!HIydfCDgD;Z zI63I6-5Ypb7yNxe$C!^jW_`!>1@FPpvN-phC5dO@-J0y+vP07uWEBEMS)QLgxPLmn zBN(-!9aLaHf13zVJu$q``@kdM>imfz^_ki7Ddn|l?wW~PP8cQkN7)bsvVr9rObN}q zzxX4Jp-wO1EbFEDoZ|r58ni+E8Q($4i^@%5T(!J5dbk2MG7_6o2x<1yH6BMtDyqwr z_KghBB%(jqFfq5)Hs#%)V!F@~TSX-xyV@3a$nyg);%tUG{`JsS7^T>vmPtMZ*Wp!Z z%cxItiIeaEu$3Zns;26W6vdQ&vYku5YwZ%zqbn4}E%0Dq$q8Jy zu%N8qd4G3_00R1_xk5;nKb8AB3}ZumPnHJZSt3aJUuN^v`#`sKs(D6prAG3~(QEYL zwJh{I#_YNlc}{Bn%p(AbAD)swUCJioCK?X0d})ks;0lkf+W!X6CyTm;I?M)>e|-jq 
zcgANdyDH%B3&cZVsL!E7#8F!)@?Iaht}kqnRj5ETSl67Rh$3D!K1*67<(&r9@n+mf z;(54QxlZG8Ml(8ISQ~x`ZIdKDFG2P&F&&m$Bi`{0gI5XtOthkjp3XIIe)=tJiPpnM zFL}1yAs6&jl*oEpV%iB*`_Q}aN-H;y|!%|6mK5?$Pf>%B`A@VEX_ z{8gXf{tw{#55O9BVk8wP(a_r!g}YedZP|j4BK47B9+_p<(X)#`hPJyieLqo{NFMHX zihe#gAD>lc4`l|&C|T3{}mmUdu2K!YeE;B?K>x+ z7%!L-Wz~p`%TF$%L1^MN)F0k?xce$9_Zbbq2QSH8i=?VL}3phdCN42j;}OhKCncnWz-(~Fj_ zm%h>8jCRxQ&0y;RDua|OZXh)V{ZPHD4Hrg1nwcnRd;JLW zZj%I|tFW0r-Qt3uLGAP4)`*?I=iXn)e~mayPyDiz@}^GK@#|qvVcmbCsF`dIT?j@a0=7SB~si>{5MaAXKtL)5SSPb%2 zv($Z5{uDW1adIQ(cCnEPFFZwOoXTHUVg?o6+68W29Xbgd>E0){q_Zj(LpjQ$(}{{R6si6mr8kM>1Xf={7f z+wQeDhGYgdSj1n%*%`?86;p8*Q84Jvc&TN`l=IOl24F4WPjYA?h@@cLm{BB;@hZsk-wQzclm`5W+WPY!XVGagKjVK!(jG9k6&X57whz7$5d( zTdP?ar*+2I<4}FLsG2g86eMII{4+}EZ5!Se)Gc(&v2}E&P*IQ|Zg6^>R-@}P+_0Kx zb83ewd+hFOD$dT;tnjWRWXJDSF_3eS&05ng?PHF6Ta{FXNSZS1*FA|9!&NCaB(*JG z);bFtr+6nrYq}))mBIQ)qERx}<)kEg<(Ug-*jf)GCdO>~+q?wXOw z;rrR`&Z3AO=H1G%JGtrHV*}c~JKTl2wQs3fUE7PRcUf>SN{&JFJ;C-hoidpvEw?Dd zmE#+07gq4`(b(STdV<@^pp$JQe2fPqE_;8naz2!rCy6cZXKOpy+S3038%!;ovLE(I z=zrNK=uIAg`x`PrzlhuAZlH9?rYVxrPm#t4D!~2WR~V_b{oX?pj+EG8iQ{yf5W|m@ zXB6W*ZYLY;T7y&)0Io3q0M4d2ZP*ILmLQT3n;>u*PZeU`c0VMYdm7a!y-2$>e%B^@ z55l9lm+de#G9t8YFv>?-i1A69r4%k^dG@XFqf69o+Gyr@cQ!~9&?$PNsrG1VMKKr>XDvd)5cT{{RZZ7??F5=k8D0a_G;yR7+(v2hk0v`z@)3^D%z*RParXP!Bxf^rK)Sq?bKQ-l1eHKwD4 z@1iei`ZT&Mc8ttqj{eFUxSW79y?$ZA{uTs&72Nz4(j=E%y4DM z_}8CZYZv;BrH!T3!Q!xk$hQGTRSRU~^(Q~dy6=V>_0E~8U)msPVV)r;mI{xZSOD2R z#ENm{yRheeQ|Kcyj`Y)TzO}{4uHWn~(L(|OlF~>>1J|IeY5Z+une%n~1S7O~WBkoK zL3J^;N6_?^yFuQgH>=p!luz*oM@;8cyqo^G!a)8@SyvwtG|qgxEkfY>%P>*=%@=@s zIliZ|$*IK{`J`3)wko8W{hI#(%gjgU+5T1HlKe<+e$ArVr=gNU$MgcJ%kdjqej-~) zNBb~m`IARF=!fiIR(pf^hTk74mA~DMzrvtz5N_curf;s*{=l&)vgZ`Pu zi~TB$o+8$!{{UI5ULRpGAIgtt`w!Tz=g=>$-5y1%sP%b2z*Q;yMI?DLT-d@t+mrlA zub-v4xSZx|iDdr(enQ{Qq1rl-KPq0*L-tWU)_)pU#h*7`vtMZTpYg2O{vc@y*I2oe z+9=!q0ErdjxWz6hb4$5i&OMJ$o8mT-GlkTq`jQ9g)txuQTRi^jNOwN6Oh3Ta1B_DQ zmEUplqq2X9*FW;ZO0 zZXUE~0kj^VAE>1WqxeVVKw(N6oxpe)^{G*Ed(sAK~_6-}0|V60sk8xv}CFZUia+0DTip zZxBic2_NfR&h6aQBd7Bi{{RuC^AGopa|d3V(lPi`4!v+!A(Q_3rDhg7u_qmAyZP%} zk=JhX{;_}Wkxc&EyU+UX&{O4MW3b*2Ii=hUai80FU-R9m%dA|J{=NSIIyqSEc0tB4 zY0BNJj~bo9{{WWn`P7Sla8LU8{{X(Iu&j05Ru-?}YfVby?bg!CVX3J3Z!L}(`!ML^ z>s7wlF(2z+{`#O^Pn>y={{VecS6c;&$>NLc0(7+TTnij*cEzdNsW8duymdqRS6dXZ z&l1ZEq>;GFx*Uccpw}&SMeU@vw;;;|tF_UO1Xr6$cdPt1)mO@vdz}X1_)W*%|dt}wbmk%dDDDx2|jW? zG1&WcuRr+BdmBEMv#hcc1cg`;l|L_Y+dZk%(TS}NL%|nu*?5*UmdZI~Sr~ycx|KaK zfyRELHPHM(y^MTA`&as@ZSrGcWG=s)jms2<{!Sn zUv@cG9O00Ht%1<kDN>-)N2`95K z&OZes{`#SNsj%FCi#{I3eQEZMKXY6n0P&NbGabjjw|epILr!~noHCF>1Cd`*R}15> z2qvR%CZlH<6R*uODEr`l5cmFc<35a;N2%bqX}00VV1la7phpq!gIb!bVmqlo;A0h? zbhr@%{o1x&O8+i??Fk5CDCdXvR^MW&mny_`@<9k|<>h$QsqJ-zC*%O$jHePJ}degT+;+q8A<+NQhIu6#p$ zpFF-{h-Db$F&$eK66SN}i_oUIRf>_!1Sqo0F36OxC+{vkRjXFIiz$xkTi1?6Q4$2| z0387B$MdR}(zT?nrB$48oMa9$oK{3)Sd6TyLjjjyxW{U07l>NNt?AdlXx8V{Zbta5 zQ|%Ir@DE7S@juFw;nk4Ub$v&~_sX)olC*aFhIb$0VfmlZu&xEwr-E#3S90oF?4ssH z{v{2>K>ZXK`ik##n4r-t;?pgk1|{+ksK`yA7h{g)m=C~LMM?WQZa|T1RoIcFZFFHW zN(5vmVtefa>71Uzvu4nv)$IJc(y~q$gKPmFMfFMC|_nufVS+U}@U@=7n;W zULa!Dzw*lXl5huRJ^rVW@6>gd65>+Pw*c-!yzyLKuNl=Xmqoa{e5 zdlJP4ea}j%X{%};5s_@QFEZ~-=#v4Lh<{r2X;f3u{9V6N9-fDM-dyS!4U!(Cr!`z> zB?$WZ{#07e71g3iZs|R|iWwzCky1+@;#{2P+kslgO$lhV6Bj@jawuVnhSkV8e)VaE zYMBic%i^j}e+(xK`ETx^+-jDue9I|dr1u4eP%*M{JX7#`~3$j;TbE4SF^zxmB*OJmRP8Hukq@t1+(v9_O7(nG9C z^UZ9{yJ+fJ4_tTJbH{qFfS|9bCyD+gr-(F*y($}7(mRyfxq>JdJBRm)z#?gxQ`@#;1Ak*^RSlY? 
zlCjy{{7=(ZlA3kx!S$L&U*=l4i>vW{J;mCQlIZ{{CBi|JgHwb^(!KIj~ zV8D!bp$xfcuWFX>B8%IHSs)sawr6=1A;OQU%HrF zTp0o^V*qVmY88Z5BO;7bxTH9za1xAClZryrTr4gqBAQy8zKek;0*Z*eCi-^*ZNigv zL%xE?(vTBwq~B3xk7GvHJ*j~v{yNfbrtEK}8$DQ10+yuQD8hsH#UH!=^&kzxls5$W zQJDSdz{ma8Gyu@l^=eR4v`tS_=?VNz083_&&2<`2aIi#M=9PgvN8kysS<<{u;hPJ4 zeLBwcuoy)NZzm_}R=k-vo{*2uy(7S05!NnZi(9mUNEt~HN?AZ7p(7{yRLH2axQ^&K z8xm&Eq-50ixpDGZT`~Uvu5~B&UyIm$`oyWvd!=9GY8TObK_?S|!4;C!0XUXBamgavILQXr~lTbH>?AtzPil9IC(6{uXKsCjN zs&oGU63PDn76P2QZm56d!H3pW6^Z`<2(F4h)_9&H^i%#7An>Mv-^7t0_y_+0T8##5 z9XC@S^5Q=k)||FFn*RXHoFnUSzvEdS@Qe6xb=Tm>`;{Q@=Y_`yT!WwbXF!cnZK(`@ zquYKr3H~)Oj{g9UlF)zK%72YyjT6E#t{YfWj_S0Z****Z0HB+nby_q7a4r7;O_sm- z62Id~<=+$cAp9;@{A-_WE5R&q+Lh7#=sGWgO6Tp@pnC2!Xa`QQY8ycEA#eL`rmO0D zJd_>QW`bdv6zw6lw%&L-ALCsd zdNzj{{{ThtwyOhnIN*Mjd34Vc#+$Tn8e8fs{{TF17Xlu&h2SzGa5y z7>PuT+=4f4MPHQT3_e~xtC3;PHS`t!v#i-d(D<{$B6~x_+*$zX^&{qA%q?@Xd_VBF ziy|}lUr-jeByJY0I{yH`3F+>K@T$5Mk0!UbGaN5L)~#JRLbmI|rVD$T=w$G2v7%kH zH@5isupLfSzaFFLYXKn;S_VP!TBo;ye9>*ARlU`d&Qtbi6VIG~usSc*i39!xUa|1&#qwQiH(G15o!h2} zjxs^{$I$g2n6EM+^p6b2WFc>LR{KNtD;WEO^)3GZeATaq_n&3)-jKxMCAjii=;~Mc zikWJLeJa|mqVOf$dSOO+=tt*O?loBzHzZM_cX``r#eC@>#7$n`TGVHT_C&P1x{_8z zz`!g(BaDNzFHX4iuRzi@mDFuK$zRQoVTN0FPf>z>xu&UtXAR;lJ>t~v5hu5}Qv5Uw zNEzwD6-QRn$_SPfcYFZY8Rolh66p--_UmLh2P6Uc)@sWY&8rwCQ6~Hs!O6#BDyu$3 zq_sqHU7S=9%$G>$oR`^xIR?6U!`r}7w#?3fx3I?Fpzl6p?bjb(?6y=a2`zRBP#@1(y0L23G)bPDTLd(zE8C z-~>li@{F5`gOW{Jg4RpxW{^j*ckM+TTC?^-xXat$v1)UztZpskRaQwDpOkf6{c8T4 z)^gj*>lBM3kV2`zQS~R*vSy89nW7{V#t$6ly*lA0OGOJQ$dD)qKJYyW{Ay(c)rz-L zO<9b0j;hS^1GTa`5!$H9au(Wm+*(J%vUU}0VmU%w{LJmR?mheSNhP!*be1HP+elQf zJTV#M3eqhls9ElYFs9m8~ zXe6F{kVrY=xxEEOhvDcvz>jx33yYG^jBan&sm6HVdRFT1zlP%8|e!&QM{E1{G<4^-JQ-&-@Y~=k0D=$~LhT?eUkPwW9 zby|I&0b5` z?p33fF(i8#aUzrC=YUTfrbik4E1c3A7^2i?zq#@OSnXP3#c<`k**P5KvjO;fRoga! zZ*3=lG+9YV|ShF{@;)%9g9X{{Yf5ru6kLLH7Ru3SF7> zxBEq-ubDNIH&M{Cp2z*6UAB*{*lV`Ub7L_if31c1c+a69*QIcFI*f9=K@_AsY*kh> zl83p7pT4KOZT|oPK9NalC))<4quf2U@+f8~ydJz2?g;8RKAAioia3sDp;t%I^kF4s zbb6#qzFq;q{vz4UPdoWgyPm@}$#`qV*4MKoj;5(+sFCvg-@BXL5A^>4^;b!&$%{M> z;y#tyqSLc8ZtCd3f-`e_xw#qKYer+v2SHHSrqyQkH7oK>YRD+uCpoGYkR+C}#^4lm zD!}&#gIf%sj8z-7#k{%frk7ElFWfJYGe!nHwk!D`{L#}G>{wlOkywV8cQds&_ ztgNod*J;LnlmO!N(&YA~*as{?`q8}?rb+e`#5XKQN;a|fr)Ea`sgJEMl_Bc*_7q$O z`EA_NYy+!`Qygper2NfK1(cJVL2W$8{+Y(j{Dl__6~A^ri!{5FvXl5$%sv;^=Es+# z+<<$!gdh0!D*Rps*W~-w_VEw;X*m9wspnnC%FbVL?HQ%aosbIbOiAocaYbl#qjq^PxZGqlK%j})qjxwwQ?^8 zXfbuNX+P_v&fnITv_ZI>Po!(Qk7zzq&SUU~y665u#XsMb7y8w>{0*(Y-j3Nn-kcxg4R;ai)=ejt?C`f? 
zDwWM#Tl;wqBb6NcFyfaxA1T8}@c#f)KYr%TeudBe%U2=zd2z=5MgjipOn;db+eb3L zLO(j|JUO99a8~yHkuAq58<02qpKs?&oP4O`G*5!feQviG8k&HS6R6cpukj259eLE)GT8dH+Jd={WKK=>)KR*dj0|b00OD!7N;=346J|UrlorS z0EiVle+_g+`|o!D09kSUD`oswp~s!tls}oO@8fR|CUdDoew1?$QnGwK@cQ+)mA~GY z)al@F4yR2yPu501)~vUQ>=hY`UF`!T{h7c6*QHTkA7~C#CD!uC3Ks`)^c>N4ElWee zz8@d+42SbS<4ztC&{w9L2>ky5#-ScE(_8Mxblhi%Nvh}KABUIkU6uNPKdlY5xF>T+ zcvnLo^9+FfML2k8MIAKDZ~Nl2?YwuQ-pr8N$t?1@!lXzTx6o#--T0csGd*xP^Ku}%KO(?93J-#_jOq`olF$=Yv1A4f(X z(yl?TX?G_PU%*eHY#-K(hNLt2V@0@9B-7#l0Dyukp42`Y*~mo{A;vX z;&GLc7+RZ@7@jYe!}l6rm2Khgv){ac$8^^@Z~+^)0Hbin7(EHDW5aq6_TIA{yJzgM ze)j461sxYYr>EmykFV%9ek{_4x#9O#xbpI1-NX0W?0pa5eMU`%jGAA8{{XXWnNGQ> zNEPEJYx%Bl6CSF&iR?P&o%AK6jPWM2x~8{rDhxv(?xF*rRpW}A!>nMxH~l`t1lM`n zoD_|={{Z9V2jf|?={Huogq9Z&G&eB6mn2WYRvoL*d^6y?tKSEBl6{P`Np7~#0oBN1 zBPjm>dulF!+NQ*LTO+}?HY0@?An-u+u7|>U$nL)i8|TE@7D z#t<;Xa*gZ$$?3;`YNWp_`&A~>xR0PT{Z=d0i~@_hC{jb7y}hdk#JW+_=4+c3Ww*Ff zl1B*H{{TOob6y+Pq@8vNv2q3-asib`a66xFwcmehY5IkOTHHG*Vamw7gX#}XwX%)& z*@gR@4xRR|F~jiPGFPQ&yy+x?9e!jT_{qlxq?!|9a?zFz6Sftz(Du)<%~QU&nq@Me z0mppx6~S5(jQoXilP8*rBpC^u<$LFf(bMD*CAtItuQ@)Zu%w9E-d8Kh>E?k>y9W6+AebSwR4-aE}Y zIN-I0O_^ViIq#2Nl^u*w>I(k=Y7C|^<~Pla+~5TtO4m0rS;Z6@b@j-`Tbym&c*>7T z&FQx0i&M5R$NjA>)T)BY)(xoH#_|SBpHQK2Gw)7_U1|}@C!B3*vpO7mLPO4oZ2rHeM|_9$*pU2L2e^5-D}vmTB8+ywxAdh=ZOiZv59mWJg~T4Jc- zNFfk@@xVTtQ}wOwb4`2wgZEKJ9dKQKZn*(}9>0}wUM^+1)Dq1Y;!}n%fB}_&KEMd# zBdhZ{+(u; znrufqt#*-!@wkxL?rN}RD_ihen)C4<}fraxp>Hf;(`_uMvQ;a43YFur( zp{n>|*IIkXrC9CeNzTS9RallgK_5@!9Qx-K%j!NQ@jrx>G|RXmyNyoN`&m1}_S(RW z>&ISw>(X^u1X0>tq#&83+^EbKaV3ZuJOj{vHRb;R6|Vl#;(Hj^AyUC)!R%d${zATn zh39CAWC)ZpmWg!@S#u0kL6blDAcygYlDDvz<& zx5aDq>!MnJ-}x#B_@i}p>jb(*q;bf>9mLK^1mKbHT$6eVp{c~OSj)Srjl=6vEILZ* zmZyCzU|hz@bv$P~k=akwF%@G@JE6Xd`}V|-s*vB%)oYm^Y2IlYbI40E^(T{4>7=dd zsp#m-`GPBzo0OP$qzu4q8Nuu-Rfa|dRe2pwIi{5$@yMhET$%!w;0lu*`qapqMoj<^ z86j5NBQD{F!R<-7vD*;*XaLH66V|u%4-DRGjhKg#W~UE34!+7ii2j4U4?sx55hx>j zWsg8TYjaQVp0j=fLtxQHdPzC*$Lq=btJ1We4(XaQgKKUK*yedYXFrc`!xgL#ddgKh z7NT%gpAFoAZ?!CXrM!RDzrwcdd<&r40AAJZ8`gPA&)_gCqTNVb3eC-yqKdZqPMc^y z(e(RRvHi$YAI}wUI2Z(h=nX*kQ6TxGLG@rN2GniGNuxhU2B%gV7Y;K`BvnMcibMXr z5Px}t5&8pDrN)fE@%69A7pO~F82-4cROb2<%^u^4sLPL9d+PoRw^nI2ZA$1TEMkBn zkHm`MJV)c(d8BA`8xOL=I1}7RWSRc}4DDaQAFWcQF1ibzJr2~$x$8`MmNneR1JvHR;b2bY#;q;#Lg^*kQcPH*Lv4~5*DS;E;MYAh)~gzyn&v3U z>JV-G>koIB<=k&M!8zw9ty|8}kP=i@BkrD`jyTO0uou1py#Q0RQ~`t+M%RkDYx749m5#7RJ-N%~_Ir^#Lhv8aNcz!#}be`Ct4Z=vD>f}C{{uciL zXOC)^G@Y4Tuc3Ykp?MZK^&5*R&+tsdqkbl$)%5*vAStO`EQEZi=N*!Nx}tjh}WK>^gfW_+cbROJ7YeTlw@TQD#mjl1>CDqyjbae+-KmUh+@OXz?nw&Y@G8pIZ;q$t$V(VVa~y1U zYy@&M^&f?4IrB$d%xOi!W_N~lud-^fO&0$EGS)?$WMB{vKc#&et=L844-8&esSKBP z#75raWl!gWUnuA|Fa5DCIN zNcp{HbYdNF>+-Um#2$ZIvh(e{wV3he!R0^wbc=!cqa|&8Led6}%v{TBdNPU>tNLns}U^nd7x_s>@=} zKfTcPofVoy4KySETU%#=?!8CqDx7{LyNy5>^5Z-W*{@6!BXPV21B_O7t>NuAT0=Ck zl(^aexPvLlE%%t8VZi?Y8k1G<&ZDI6 zl}FmGhbCB*+WBD5IG^Ek>E8A^HVzBEAlm4O4U#5Cz*jJNT+&_ag zXR^7E?OG0?3}!h#U{v)+_Z^RM&#h@#UO$92q_Js@#*ucr?B`?m$$)I%^>ZFc_yjS9vhT45D9Ze&| z)_>^t0FHL&0RcTsV<{tE)e2T^2(s`yVQFc z;FuCQA2N&)>MF*Ss6Lx`85j@|3al{-5c zCt?ObA^@%k=mkTCCM5~V4F0vSB3~>n&`9Gs;+odcIsX7wpyQ5e{;w{Ax-*f~&Kf|< zalkFsug9rFJ1}VM$3kbAEiqTg|mFEa3LULGK{G9tmj5aU5+oa zJ!a0(>YBB`oQvezvr8#ODvdD0hUd^{7(aNi?M`T{tsuFQ)(^AUf>@*~7a*MEdSvAO zHFDYrt@O2EOG$B(#YVxixxn@QwafUYS&A!B2tT-zGTvs)h9jW>^%*PLyxB@Bmn^;~ zw{v3UzOe=FmnWMOMR9GC%R1u+f}|7b7;-V*y!Q3d^H{l-dt-G3DJ-%@lgf-jypDQ@ z`sTKDTiK)1Z!cwX%@T*0$^1oBoO_&<1pffCeJcjV<&Nff?e1DDu+he>Pwv2vXe{Fg zckdanzPz54uvEQ`I|T{vR9FtS-&8h{h?_MBIu3lemC$xaR}Y zu*FclyC%`?E_9bz))BGPp`9CoaxvGTZ}*3xtf;uVMLVy^C4Q!dhr|O^ztpZ_g5C7{ 
(base85-encoded binary patch data omitted)
zVFO)|`@H1dgHuO6zr;5$a94sg519_>d#6;cd?=LKuFnDUA4brH>{YWoM zJ+FYrqPDahqAlmDc+vXGHcp)VsLycl7k3Mx9r5oSqsk(}H^2Ft7M@3qLtV~385Z`5} zWvhIrqp7C$+~dNf<=q_U`(g^lt<)=5xQZp~aqoo{CuGlT5E$7_e11TDQai-LG6m0h z(e%r*ffxT`-Lm952f_QXMtU=gJLVj1U34DG&ud>!TLe;hv}nCCnZz$ov#Lj?uBEC8 z?(Pya;q_>X2f4Qa_5++>kVBO=x zTMqD%GDFOAt6?KY|EZRXnc9CjGVjk=lCqP(XuuUthy#5lou-=NicS=H|AqY1vE^-z8;(j$c*I3Sl3gb+s$xXkFN z(j|aN4}i-=>xS86DCUrnO4u7QSKIHxasm?|nNHAqZyT@9A|Hu+AL`bATgVO?RfC_v zEz7+X9zCNF?t-QB{Iq92-hTd0u$Cx!TE?a4o6JGuSnQXGnuzl;PvL|+kw3TbEm*62 z{6C{%IZD3F+Rm#)AN_bpONhO_wR7roG}V&@s1U7eBu_Z61!rDtO(=wyO)KBEZ*&03 zn8q}ZJt`p<+iRiK_=bfAq6*G)7l>;fHXLHZ! z3Tg_Zvd655TUq!L$IHFqiJ3UKmkH5l)})360~IXm=Q#&=(?cr#`#+-P?kI1N!NXvJ zaiZ_Kh^__ey@E!+2=}n}L(2Iu0wrTfn>2k2X*0t;U7*d)ZJj}u(u8hg-tBvOB+hSs zC_Chfe-3o;+%&im0}wb=@u111KAV0vg+HfGOwMbId7E#GlFJIb6 zo_u@A2_rX#N)TREoSdH(PqD%hiAJa-ea6vnV(m!eLHz=;o`;8mcN1-wQSxl;id{j& z^+PkS<*3tsT&rq4r6#;8-oLe}F1Lt1y3y? zSm0YmU@gh{Jkkk*B1h@PKl-)WHJEda^)t=<>RcAT>|^mLHS))l^N-h+5Mk-k9HY#k z!3&fZ2{lbLc%ah|6SGVju_d6$Q_zk6n(busrT}-8e@{()RS}eypfmtumFq~eDHFg+ zTN}WqSWb&MIzq%l!&P4uyUF2dP>g4 zz>GDY>Q|po7X$b(H@(39(gxv_Q;4yC;?7Ka=^-*AUf^!`BEz6pUfs2RRWY^@pMzysIWGmqJt}VLzYF7e#g6^3~Te_%98SctVLAl{JxkvOAk(Gp#Tl8OGV&g;38HZM`*stIDNz&^}A36 ziOJ-#B))#b-oJzss?z?UHb#l;qqcm|Vr0i1I6v)LyLqPck5(Mb`U`}^V zYQdbY!bCpK<4z~Rrl>-bJG!quIhCi4Nr~1mJQKfY+H_pOwAnk$z6(s+wn2%s9gA91x~fePAtcdv6g zmHQcX3G}FGn|Y76dzCEpQ#rs(DzCkuePa;vp>gT&F?r`ea{bC$?wXOJW47icGAzM) z8^3pMp7%Tz8X-d)1kinZ{@u+f zoWD)c@Xlo!@MT37W-oW%4b1m*|=dDqS6|SN_D@(tdECf2m2NF6XbXxq_U7u;+$i3KV?laZp*IW}x zWhWuet$kw*+n}dDhhETDDw_wqh28U1wQMv;N(BZ@B zZJJ_+vpnEFmUnUQR~DgE1faZQY@vN;^M`;444P)|g)`YpjyB4G}Tuw|2 zCZ52{&DpZU!S;WJ$Y8##CTD-_q&-61)8B~n6-rZ!DyXxUA}y_9SJk%orSf}O(l0(G z5l%~^1UlVuWSUKMmKH1F%Ack2>{R!kK00R1e_fO?bGL6XdjFe@$ z=K`*;st(t_i#bT~YDOx2z{pn9kx3<1K-9MrJgIkBIVyvAmBI z5jfJR$yn6gKol%H+T-wUKx6rvz%$YU@of$KM}`B*!cT;h}k}0F7cp{EmbC>LEpX= z7*(sGp55XV46sd-m*Uwps0z3_yWc!Vn2axTs9W3;S^w)*9I*EiFe5$#rr6y2}NX#S%G^V}!oV`+Tv+GhG zxs6n*4WpDfEs65=5H9DCN$n@BNA1@$!z>92L%n|DvPZrVG#JMr!kZMu!JIQRM8W(s zx7rD)#HDcT0#8D(K1<2R-=QEgxmFc(r&hW!Y4JSFYZ-=!bcv|8b?9i& zrtuvtrG0vQz$}zofYnnEw4=>nAx+-X{Y9uZobm3#dDnK6nt3%Qji@Xl0$eC{OD}-LE10oAa^{wy&?pVf$kG zOp--6PuFjVoL4`3dXM~dV*As*ePL>TH}lJup04``Aw$69)8yt}buC#6Tlp6Zxd|}IZy5)RY{QkoSijk_OOdE3iimF6?@;|hy6qqwn&psRG50# z^~ZU*8K3k3jq9LQSdQ1HrZmX)=#3iY`;NV(QOP6dDj`{3g=*$!(f@!lpHQ(tv6}1y3!|@g_4g2VUiW1e$xiI8*=c+l#6%QV z{iv=N-hTYq{KJYAj|82yKZDxm#r-c@bQv3)SynYax_zB{ZuV~qOKLI{lE3KUt zaSK>dPqg5$`(H-!^yCdD%0?T+rO8Du)%*GAoA*M5?RQ?wcXaQhV9@yT9qkXdenyNAq+I@YGrUe)DCHici?=>ztj-~^8>=zxnANIU7s9R5GN_JE3)+M!zYPa$SaxB#rNXP2eI`V=jvr{dT z05dt1s#Cz^G!2{9Y9@Dlnrzo+Tmz#>3UkzBwdR=5*z`Fya5rIozgSPpmiM;xT-(Jr zpKDun)!RB-v;|uhm*kPPN&}NU7ulv@7S6R|i`FHqLHOMzSNT}pQT7WQbV*Fts^=GF zx8gCxu*y4Chqf%6F4?4rc&LA%XzPCU9YG5E;0?O^lQM(pu1w~YQ#|(Sz2mte$C%vs zZ-x=~zxlMEOZ5w|dApTFh(lrp6Y?VVcwYN})@$cCFO8mWtBezK0{)*vaaA!)WnDVD z38jHyW>I96)Fyz{+^e)??tP^}mi2^mnLBeDZ$7fqxPJZ&XZ>VVA%(5~Yt_w@P|IQ@ zdtl(0UX2hA0g5qBn?D0e$NfT)(Z|Jhm90A?5IWr+LIR|kx}|hPnt#sMSZedHCs)B# zauJC2M9UBInGV5%a*cc|zxX^yff0MHY*c5`Z2XghU7PLD_WdgX)3FvFQt=*Y1F{(62<|RqeqW=YO%B}b{%Yyibr$e^F=mXLT&PpqRhe0 zRFjFAqNH~GuPSNteevb0NqA1 z5!jCItjjB*xR{fNkniagHm|5?1jCLHVTzQ_!W|KLHDL za*pVzKINk;Z~Ly5R~?$$Vnvk@9Lf^>@jn{ceuWRkgFRu((Hs5OZkU= zlZKJsHZSeknM>a=Zn7gh%mx)|DKrQUM)!%qEzImP#*81>u7UZB|0iU5V+cc`B7eQbm*zazfiV)t_$G7c z`cc_)+fwt-)?88x$WKk=;a~YiGRZQbdrrq?DYW*lz}b*s0CJo(Z`mlK_eiNfs8caT zHoJf=VDNeEOHNF)@LJ9t@L*ZqNaCh(1jnwLeud2FrEF@SK0^=Pny`X(xW055(}YL< z!xqmG==j$^NpX>9n?@UvzkmA5zO-J>MI#3@PDFR2y7|K5dS1^Qx^i2UxXP!fsBXHa z??NgYHm!7-Ry$igHNK#r8}@BSWVZWibPUQl)+F+-N$>ad z-52X|;|b|JTR@-Bm$l)SNp?o 
zDZF6iIv|EDNB#n30qioN^N#5uxZmPAgZ*%I*7wK7<=lL4vJm)^U0&Z}Czg{E_mbG4 z2uCZA%_|dkM>)vv>;oOBz!aR^g>FNOx!E-^+wV&7?p=M7yoT=y>Gx}N?rxyt(w%US zm)MmHL+j4A17&c=>;Wfg^&?K0G#7}2!{&wdp{KYUoOW54#KdoyiMa@ljir;IpyfW zWdn`A1)iK>CU-dh4pzt-rGCNq)H<<|2stK@LKoBH@?#C?#IN1K4C7R< z_3c^1A3c{sNAYYLVit_cR74h=sCWdwtaxwhY|*d__|qNdQ+7lkOqW3)8xGN}ogd(2K zshoguP%DIn0WDqFZSLS2$14SG1F9Z}730wBW#7x9i;=8N)ggH@~=PvgxPupPi&M zwL7kabJ_Y9y%B@`!mLH#h0_6(64lTEOo@(aJ-|jGy}Dr>_){IL{r8Lqiv=u=mPvEh zKF18mkb8BBA+yqw-L%`b{Q6+ci9>N45`KU`v31@uU@J)S+wZyJT*@Q;^4G3AH}SOp zKyNNscTkT`o)#fL!>2e0{GL4Eff^gg*p)l+CS(tbyS5w(4K&QZG)UL94(mOHmDBCn zjwBw%ReRKLxDtD^F-#}P1XIb*3(6#{5;^Q~{3^To~HURQwk5MnsH% zo^HESby8B8*EFq;78xhD0AQNwzs2nxi*nfz;kP&p%7)|DfIy^&K4~U@uXLH2b-V|7ly~g$$LPJ*uD6&cSy9X{KOLbJB`>pfkV7zHX7}f+y$aedN+M{HxDQM)Jn9aH|$Y(ey+_C`+ zX}R#vpph?+*}VlW#Y6KJJMB})D`)C1v2pjTYLmvIJG z8HdAJjlk>87)&#(fySm*3;f?784L*YZ}Q&)h*nEYQw;MFu`K0Bg zMCBfeiHrXCL%_Fg-J+zRWTv8G7UhC)iT-b2|5`wFWCWq$1wt@4h=2}ENC*Dc0fGX5 zCo%YcF3|sdfe8qSh)GDvZjn;}PpG8@5r7E^35W=ZiHV4SX9ok1gNW#e>A6IdNNyY2 zl5%@8h=wH>knt#gX=OC}z0WIV=M{d7oaqiT3+vr`eEb4};u4aNrKDw4o~Wv+YiMd2 zo0yuJTRgS2cW`{^dQ0$ikvxR0$e`Ev^1q zQ~Ry1zTtaYdq-y%uDhpqcw}^Jd}8v?)Z)_e%Iezs#^%<+;n6Yvzy$ulJ0TqrF_#Dly^Z1dt+#l@ zjP@Drynd52@rp0rJ@_xQ|B3AXUtrT{l#SfA@MKt>PxkGtDx#8KMy~N(48Tsh(w8uJ--yUBc zDbv7=wWPLj3`2;w6p7xQ4{4oz@}cg|P0_Z0T8cbYf`%mh)ZbJR6tf+*6^5{&`X~Kp zg&sfQ+eg2BV)Iq-M*IiM&tpVL+G;H6eqz^$$fz({*xS78-*JIqSHg*9kdZF%@d-+l zbHFkqAEXgjZ}?Z>iDF)6#zk%oDPIx-PHFT;x@(2w&+HY3Ks8>S%Vn9c9;5hRaF1<6 zGA^~3Cnb)wEPJG$VvTcI{)ch(h`6(j*tPdFSO$(K>Q z?tGE!vH*N8bj%+1B)I4$-w%lK;e1<6^K)m0#S!Ggw)PP3?C$w}2zlJ%ot~uR6ox}R z*T5(M*;S-{vqPw+AiZfMxDrarJ3^{|Q)?Y1AK@Lmd|}8u`#vYu4xuJ{duF7`rC~Wv zc|P*>myG43Jd|1e&r4~+7z#=+wZIFLtcFtzDo#3drtuHOi;Bq&7XPE@9HE zPOo!*%w5Zs;^rCY;JL94v=Q=l^0S+=)i=4I!=cjapJqZMdv?N%I$sCy(5%VS*#wz& ze9XUHJ-~R;02{{6=z0osq39kJYbTDb3=#A1y?o4}lLvHSFlCgZ9Z^!?!wDih?+Q{4 z-us?FfjU^QyzTyP2HS~-9b;Ka=mCZuI9svWxCXeHJfzO@`h(=UEP-_t)h)pEQ9j+B z8k`FWN*uLo6Is7gwn&7;pU}2raDh8M&n)Gix9HW=SN;;ES$wlEU&{P@`j2(_4BOe9 zl!q+qm$VW-LTT%_cR4 zY6^}35Cr`6=&Jn>GRVa(a#$j36|!97|Fc!iSit+ zZ)z40Z+Q~U-|VA3?mjFVO=uE$^69cv z4$jP89UQVhA137rFo7x|{nq85HGe!TT(oVITu3+E-9dT?N*#d|k`wna;gsvngIFwR zKzh0M4h6-b@1(^=1D3Fg8O#twIih%B9E$d+YVaPL&QnQTRaD)sfmVta=1TQ*g)%IV z{sX-b56_lbFBhREY?2CgArWXxA{S7-C$*zJ1j!o+g$T4~!{gQ0xD%c^Up}#KfGL^2 z6e_g#!#t9zN!?5wPpp09$am1GDt_a|&;mLFIBSq@EIIM{G)G&>?76X6s6tv~GwYZT zH`@=6{?uW_O)V2JJsP3Acc6P;P_%s}(3oPF!04>x<1*$L^Ymb%No_bVS0GFX_I&KN z)HX5_wN&Wx7B&!i&*eXkSC%WuN64687W)Oc;!~F@!5_bFyiHd!{TK;<4L;Xz61m;j zk+^PpU*f48?DPE%3M<$zZki>P(O`9;A39M|le=DSUXReFvtGa(Z@7r$)=?hJD5olK z9cUE%o+uzKD9^cfYER)p(~Vo<`vSP#_PnE6VydlYl~_(@K)ef|Z&^PJi9Ww}`wiY3 zaJ_H3>&K8;8tA(|wK%o%7JJ%+D}TnFL{%)ld&~-YEk1&kH6RoaIG*oHqUz*Avuf#( zu)0x8p^uAgJGpX2>C_cL=0^fD zSHYvIfVX#O&b`hYfrRZ7K!3UP!jsWfVk+bGko;|rl$gr5_UEwaNnoU>t%9u z$v7i%ngmcd?9eBpx2*CdYs@ew}kT6Ot{b{4awZyHj671Ciy{o{O!C?kQN z{>_Xpg6t>pMfU>%u-ZeS5rFbCWD97 ztMbW9j1i|8q+V^l__vIDiY|IM(MYf^GS5XdYV2eII?~yNiwNZRH3_fPaoBR#Z z?he=mfsDUXh~>THJt?8ZZEm@)j!YIYM1HFBu#z4eiNiQXz9paB>|Ng;N9QnMdy;SO z-Z~}}Z*PwOBC;MoV^?0he;aYUK;UUc(1l&2s%H<|zJXWPLN-vUnG=2m_M#aC-l0Fd z6OR(yh8p=Q<6m~Pi>;l^fVp6YzXO474-H&Ex!g^kEa`v>M$EIrDMy88tv6*5mg(5* z74*mPeCu3g~jx(pre(-7&X(z>$|6bIN-D^uyUvHg14XM_ND z)U1BJoY~Kuz}NTA6L<5XICAlvJ{E;(Hi(EMXK~P!3))bF#8_7@?lp+3B)4WsfQl<# ze0)>CBsVSeK*E|MFI|pWdM#8Q+v1(LX=JfVSZSF?9XM*A3ozzwjYQg!aQB)p&6Nb% zm)Q&#BVTQO_}L@nHA+HZEk?V9!&s%%fml)58y_?>5*`26iEtjJHt}uqmtH7M<(a9M z>qD;vsf%Ej>1gIssCzBX2MipNxT#eq(ygGm3_rE@HYrSgsMF{`Sbt&kwrCv70^*Zb z- zl6vi0Ii+5?;6m@o(&vD?Srod$fY(ias6>rS++0I9T%GZ^p89LrhuyT(!g&nim4SQ1 
zu6hB_l>k*oO4xUGo)K}d!NH@6gyZj?(mWwHZv^QdN2Tzf@L@J3Rc*Ho?o zOBux*%J*&A{h?Sa@n^!DrwL8{8d4~fA0d$^c99{3;U+D2bl92swxI-x>CwBUKfyl^ zHpId%)(<5h)q-jRIP(0IkTgz#ElY&uJGS}ze?4Mk@@{Hv-!zQ*ebVFM6jo|WJdiNj zUEQmga-n9}&p!y8`c5Uu$RwO?&rbu@GMdCV?>arqX7M{ne4+K?Z$23_A7MjzVSyA* zAuR{cI>eei7*M0woP4zd@mWSGjvEqF8UrZ*#NqFv`RhN`IntH4xN2+`z$!x+*zdf&6?1tlA?BmErtvUA575bXQ zl4pR2P+XEr@>4jv_5!T)KD^T~Vva_Sa0`^OjMR+3PPf z!8pXVsN3k3NWS&Pg_h;B7qz|EUjAogwDKm8=eTof#D1b*6uXLkxfD|2Esc;(qBjT% zl`(7dzF$G7@`g9`bGs?_z3gR2zEa<}P zt9DgpnC66f?0j03WG?yvBXu8qTI7}c9@5*$bsqCYH&Ob=ludfY=5nm-iM)vud!*vY#*9y?jxM8vEoVDOeLOOoQ_z$- zP-e8!Rex3xP99cXOe(9nUTInUThdPYtL>3Rd!9jur;hjSE=I0+25Mm%hTxo|Q~1Rg z_n(F>5Dr_oLAI=5xu_|PN58S(HZvQ9WI&@O_-kW)c@MNdQTOFPjekrTK!`6lWKcOH zUDndVbQ{oAbwU<<9b(qpWBRubM^dI%F96n+quY((rFhhMOEG|EheWTLoN%Cz8GEf= z?@YY@xpwFzA9V=2B4ya^HF#q`TvU?^6Dv)fMmDe)E;#n!TPhz_k4p&MdvR~~Sy{Ix zTOjlm>oFCLWujT_!?M-h=vT?9Ip|7`(VVt%Nv0`I8Tyb~(N5OjQA^iDc{po`A`2`> zt!U`kv|2!Ezc=Tvw@>(9L4#l*)3@eZo5jmRE0EnrkghdLIS(HT@C{8&vdl{wzpyMW# zUL#Y8x)l+F;ekoQ@`HhnLxw)?XzP~WqUn5l_RP$i1FD8HS(0@zChS`V$2KhyIQ@2> zS{JSvp*Uyb{sKqI9V1yr#}B86zhGxdWIoxE{l3UHNo|cHKW-+dNAKklZsg2UkK} z)46E*QePQ?y9Z=o!rVKxj||gc<-*wj2PC(?r=^r)c5?TM)aUus?&NWTsqYD;^tIWK z%mH-(oYQ#Oq8(eA>2p1GV-@fJJNqtW=SI(wdS4*nxw&ll8B$&bUafK`%M+1vZVDZ} zfdozPczeDrd-lsO*WM|G$4@YuaydK9;Lk&K-wHQ+7s)aiE0#2kfOjc8cf#oR zkXo10{6k)$dkW4kz!Uqs@||9%6DN0mvYXV8YCZcq)m_oi-JUOW^^CNC2f57sBZH+H zL80b_N)()pfqfju-g{_2D*iNnrS2kXxbW>smsnmh$?uC@SiGZK2xqwVq)(Y5JEPR)7r}0>xOOwMzG|J4HQR0%bDBW?Vh6oGE~hdiX$eE{c-px?(el!H{s zsC0Z=CBx_ky;O>DX})cM0SSnsy;hHu1J6uaY3X6#pl;{PyLk-7!}g)qnIk(5&4AC` zVcvME60`x(Km#;XyFT2T_d`PurvRJU#7z?(6MV@rnxmH9@id4PWTeWaFp9Rr+v?8c zRKv+Z*1t8^`0B6tDdSHy#6fmR2{l(nv>H+j;02|(JL9>e;t`m&Y!bNO(LkG4mz3Cx3_LYlmTCSg^uuKFg0jdsi@j(2I+#1BrxHg5fewS8yy z{lK!4aQ8c+_iQTrYF;NSS+(N*C%Z-RXAjoaWx4qc{8Dtl#-7KTM2ncH%HA``~wF)@5ir3@$GW^;8gIN=LSWdl7Z|wOLJED ze}^-E4V?{K`s_XkM|oZN$8a5TCp<~|RmB3KmX@OrJi2y!CXw~v2hyhOhx?MSSZLt! 
zM&R$)N@6;Bo)&Xl^|=r3tu9EBm95oF34Wedlkd3}v;5ID6M?G>Txe9qXnSI;t)gwo zHA)RQZW*Y-CEpAnRqy)QuA`NbTRORH$mI3QsXYSrH0~sboSPP@&2(59>2_kXN+nNs$@hL5xi5exx;eQD#g6ukT)D|<^EV+b++?|Xz*@6!i1=I z=-c^Y`e_3Wv4r|-eONELMRG;BH@%gk!_*_8Xx(XQ@<@s(c_e?pv*pTJ&2d`D)}xQF zK5iE3@u)UooCoR~xZ*viC&c8eb9(3$!}Yo(lnt{B_<#eQ%@Q57l1=t`w6h65W`k<~>ss;^1A4 zW8Y6UrV!+O-7GwUjoBYZ}JZ6zf_`nV0gqD}(^g*@Bpto<=F zqyCzViWk8Ef9^7qYvnp{+R8L%mNr7GgjD{c5D|72@4M0bz-jXv*3F4QBpwq-kmx${ zoYb<2V%rOKlJ_Q<*(%1!K;{j*LR15c@!6E~3EiI$k{q5h@6`xwd|P(|&0olks8RoV znPh+FU;yWdI|OKrwYp|*)3L?iEDkH1%!95O1hqo?gNovB_RsW=s#4fHL9OeE+(znj zEf+Qc&H?Y*xP$)Js-SqQUy*k2y&HdO+hOO#>31DX{N(NsUR;b7YI!Q0bKes+aQMQ5 z0LoOE0A(gl+Ts=J`AExW@dTmD_rR-1JwRP;;?e0JR`d;%dXq?aiHjzqK{9XBd6YyuBbZlH~ohwO0yQk1DD6Zr3t#T1*iE3GJO+IKHPRtMLApo3*sk2VE730 zZ+zWH?UwG_?@{Mj)&@l4Z<$aY0~%GT5)2t(g4M~-)a5AiURZwOl0I)1Og6N$PCdIA zeITph$-F#{>CQ~o6PwR3IWL>Oa^?JVxh^4yA)@97w|<=}zfK?j^AHLWqbh}eDs3;g zFzf#Zx(aBM?-Uk4{`aqj?lSCV)F-~Do$Pg3Jbt%w^=bw0*IYsDBq!$rfHcE(evVsS9*t{I`9 z626S44T@4$-@9{wUAc_jRD{C9{<^dP?$MY|434)D_FzJu0*n~LuQ=5;dDM)rzYuj8 zUq0P7kJ*4}q1|C2nQN6cCxBKh{Y)Z57Yny$x z@%6f3S)}1lBh}|y$Lt5liU|QXg;(Qh$rkJGB%tkVN}C(sU^2-@6*zJ2ca{6Ex=duX zlGB$fXwH}F{rr+Mr3^d>l4LGJB!Ud1z<%_%1zx`HdP=ij)eR2vo|jOXSm!dKZ&(O2 zj{i)&F5p1$sj+KDt|O6?5?;4$#rlLP5FtE0=!J@Tup&gO#JOH)2xu0YOGMG3yZs(3 z&q|fEKRkbcCtLdjGWqak_oq?t_2Bpi@J6X*N4^hXiQ51^{ScoucAS&AK8UZVrlGa6 zeui^z5ZqFD=VKSL3oExfD{M_!-`M~sC^Ee`vGAZ-5;+@1Ds6ecl7fI_e*;zU78()J z52B`l1dTKl#l+OV?ttUBi}xcx=wfP@AN9}*qQ??njGT|Nmlsc*3caB{PXxduTW-IP zKQN5^U5>mZ1xX#vUHQHC1gX{C?D6z9)HNjlLYX&Q@Z`IFGAsB8OqvI=&de^YP#Y}G z?xB#jB>Xc^%bdH*!I=5sgTH7;LrFtr&G40Fj2tZp2aGWbGp3d(BHvr#(8k3FiMz?-IVhbB5;w=sXzP{dsna& zF0hyzRrASR?;~^n%3>g%>^JR0e#5HK} z_U8D<;Z{#Cr6)c7XlCi&6*v=(o-|IYZNGm8iHmuJXG(NUCr`#t0jLTsf$$OguU|qd z2hJV_9^7~c=}NH^n}rLIwH4(RvVPB~17;rgd4Yo0(@5-2$JE==q4fG?jW|>$I~v{XB1jic*xHIA@;6_JI6OEB&+DWbXb(%xoU? zuOj%6ebee8cwU9uL-`&Do06_AHc(#Zx@Ol_-$(3%dD3}ShNfvIQ22fG3mSv}4OaFu zsTyLYZt(ARTq5t;*o}wM;B}B z|3Do7rPPv?nRn5EN8mwm9Ux@p41R$ru`3R#We445s7V8j+|-6dp+D}jMnHeQKv`t% zvVzVdJ?fHxbHy(}755nt+^;HzVD0ui{8*w1I#coVihqu++RgLwc(B~r#vWxD9lOkT z*s~fx;EP&w}xf-dN zy3STtGQXF2-qoc5YVZ8e9+xssxd!HN2KFn@8t-Rj=?mw+6wH-BXDKyBGsx`i)Oo$T zk1M92glB=!^XRf=^e3J=e0f>0%)mHr6kEV@yn1h3BMpjBiocPD(mnnjD(Ukr3NnCI zhbuESsB%BHF3z48#Mrc-O8Qy(%Z01l;=NgOKP3pV@t_XB7|+s6*eY8g4w+};Pe!$oT#K% zqj>oKRrrXcgFHw_#208#;ynJz z%`yQWj{P>_H8<=OhRrAU7sR@N z6Qdp1uUu0)80#MQ&6QNn(9BpRp16L431*077QG?e)cfz&@{xrBjP-*BJBUn`)Im2- z=S?xY(HFYt>d_&ZWQ)L=YTU+kTl?5!y^nPfG@F2-k6PVJk#v@LiF1&1Am66evEW2nTz&8G)6^L(Rb zxw6Jlt)ik0+h@NMYVS!G17VleJQ0--%|0!WFMfPaM397Rn|u8$wGyMl19=h)r}6jN zY^#?>UGFP<#U{>2{|dY|rw&$w=eR0Sw}_>7iZ|Ne2S<={s8?UTyaGqUBfCJPn~Gg~ zq=h0^0P?A)yA z?M_{ES4F*Ta#aW;d;WPAyXZgAz-O0;|4anQKC?_qaVbh|>eH>f>%&dd^75?0_8uQ$ z6-z{j+^*+bO${fFz;OW#WClee$7o`r%OSBi59TX=f-&U7Khz|O+X?B!ALpQZbw?7e z-S1Ma1)E~dqc3-^HVCYsXBYaX(-*_DK$}(MAIJu-bzl8L3;uI{JjV~VWUZvc3l@fGHM$bmvl-SYKA8tjb; zNcg%9*qNp`zKLZ(qUeL;``}_$IxvsC|5>E2|3C-FdAl_0ii{7{cWwTGyvppa;%Wg! 
zNYcx6D-sk!$1nj_&d*T4aLupiG7HDeR$kVMLk~`aZ_Y5g&C7=q&{>K5u!Ey_aKI*& zwSa}-BSXHYcE3hD^Nt=zO`etq#9N1yf5mgI*7wB^no%Gb{Zcxj!IbM-6~hwo`BXVP zA4EN`=dVe!P&D`#_}%7#;t{E8`~S`pzyjTePkc9~3;xautYqW!p?N?n+ZNhzMFpY{ zKGNCKPu)X00{5T#;!Y6ni-UtRawNRrY!qEe3a+Zu3^P0 zy%zCI>u&l}wf*{TBHz=BIPhh`aQ#VSLSBt7Z3lL<-V9y$YTVBA|Dx%-!`b@Zc8m&& z(9)uamYUVtwL{e&v3HB2_TDo_jVfYRDMF3dTkTb~XU*2Az4s<0-}m%)UGIPA3RjNv z@tpfv_j5l_Fb9mSyZQjKp6gEhPCIzXILx^HJHOkIK>DJHGGMUR4}Ll<#dnHSop@y# zD_4IH)q6eBx4ONp_`(M~dJ`DrR_1{GS0fPoXFi<5g6OxkvkPVHOq{tWTlYQ4qcI_*6w&IrnKd+84B)!I`SY6px~5RWYjHV)M21KZ2Q_N=uKm*BHpbzm115D+5D#EuX@`?13R zY;K7IKiPsBw8Yt=g{O`mOs!oN16*W-f5QAwM!;9!2FY9jT~S4a!=zpHcIb9|5Nsfr zh+q$(lD~O)JuO@YSfvtf30GIUxC4Mq`M0V%#MO=ICHOp>v>vvz1l`lsp2;_EB(RkD z4+Or_b+DYQ%BjuG$4_>brcX671tj&GaYC%n-9rGsBwwVGq-N1~v@u*=BPGDiKw*RVf= zc=hOGv|=C<7owCOhdTcOTUW6?*-k`V05_D$pCBRu{#H%k!QSz-FF5CmQQaiq{3lEg zUXG)PvB1!S-`Xi*NUa*M|NhdV(=%$k)1=uqYQcRU zN8qEVZKLBNUC@brg47F?{a%$WS9s(X;|zU$=NFmxL}`J7i)4>|LXVPT&Um`>E6)z| zbDFX-RE1stTVhGkvH<#G%OO=SbRb7J2ZMF`*pSS&mOTHC3X|uYeU(}wUGe>8rlbd z0dgH=fiwPhsY?7T;weIH^mZrk8Da}RryAOdxI|q~#l8py-w<4n-7$z9=)xA@Jb9rh z3Y35;I&W%&IPf~$XEW|FqmifSAWgs)qiv{@ z5?ib`b^~}Jo@YU3b4OvKxZ2%cfPVMO|3EHUwyO_;L;q)iKugdWFq>ph2j`L&sY~`9 z{0uO4Xa;aAt!NqbTMoP<6b0NaDnsYEWkSiaq1 z2SRW=o5+Kiqio`@hxE#NH$92L_o6wn!N%?a+U4SboIJh>?exZRxj-p{+oxdm@863B zo@_hC$Ck(Xa=ZwCq(~d1XCBqnX7~-yq)}3D*zHLdh*Zw1g>=wkeTpS!6$m#=_)~N zBCAR2k|G72xCX46jS|&A<7=~7TS*4C_#iHaAFoDB<7aS!8G)|8Dp+?|9P5v0KqM5u zE4Kygb`CoM@cOu4oDbuGGQ1c{g%LJi=+Hh>e=~f$THbJyv07Pw;PeBFuvKw&$J zLt})q-nW4YxNE@r7OzQRrVX4;&i`+Fm$+Anhz3|bNKiXm(0`&E#38t>$TnWtdR4rtb3^fZYwVgC;1#%8ViKJ*}(WmoNet65pa~X%Q!g@+#a~0CIR;NcIR(C zaLI?ffc+>ovfkGKPM>Qd7x*Y44Esy^QDp4BRct?3cYK$LKQA<2hMk^a@c@Pk*L&9c z5{yenD8GZUV`{Wt!@SzVW!dM2BoI3@M>rk4yMe1p$_r{w|0R4H1p88_sIrGppFmEB z>%WLmBNb%(l?Hh5?~}*v)ftz149DBE^@2nT-waR~v)mJ!q!oWxr>1uSzx6E#J+~xF z;9UV*pTlT>Se&rxi%qB1&k2K-r%jurLaY4f$tr~{)#bmWB`xeOJ+9v>ry~hifSQ2toFsrr#_Z@yY%*Fu% zJg}x3PUqhd!3s;#y>A1cK*(2YF;Cx!M)dXre7-S4J%iIZ)GB~w+ZGA9&GSMBBt{m9 zzD5ehF>#16%-9_tKsg0_0l?nMw@*MA1^g(V(N!8keXL5oEDb@E+MqS=+)&*VwWa8F z6$ct$Pn9q|0VKaY{sS#x>QA#tYX1XWVUXBUtnFnkjt#Ib&UEjW|DbMh8s+;R=&>jO zNubWz49;(;XuydF+I|C|uB(iP8Eugc`V2YmKhJC3)IeeT}#VWwqH2*ZK)xm_zU39^$H%32af*K z{c!UM=dbGz z4l1yq=zY2E{F@-6U>3k1A`@ZTxjGnA&D|Qz6tJGw7}xajtmq6_2D*T&#Ur!ES_ApP zj;03M5Vo6(5c}ep4UTpxpqM-cZfO+E#0f|aQtyD;t^Prq-o!QMx$}lycPa)jLjZOl zB?tlO`?ib6r_h^C%tSe`nS-7ECiE*BdiG`XMghn+fKW;q@I5XX`1paGD_=N)*1R+Y z_425)4f*K}QrcZ6KYIqaXqwLyq| z8ZY<`+txr{g!2lHg_Q-}#d2||3#LJC)Btr)sMsLN9QdGpjqLAwdps!B;Km>+@}4EE2|49QL{6AyB)m!&&-;G5CbJDK;BS#2F(QN3nmu;E;S5T z4hL-tO2(5!V?M)_qn*!VwT*z)x7K~2{^kw`YtpFf zbTC^rY+C`>BxEoFn}%G%Psic&(5}^cb*lT z@UJ~sLiP%RqNIKCSg;m4At8sVKTebBB+^}UU_UtUH59(vcHB(L+`maarbhrkoP9it z(je#A~dh|>21|KI%qA}DXSjYw(^+& zJ*$(l5IY#pFUk-N`aOZfybf%$z}n6z;Mm*{4csrgW=ArM7946?BfLL;^~<#cb~7}L z(lO|T_uX4)iMnMVi#r2-%MjO*f#Upvl8?$^$g^ycJ7T$a*)O}r@A*Z8lC>tyRB*%4 zmQk52jXNdI=eugnuwy>cOUP;AiZ4rou^F=`-8XMYAt&Q1Kuu;xG=nGq1F;y;qhC%~ z&t2=?KtDsS9-fr@Ys0or0~JoFL4$`^R}u)~;>J-Ul%^}=FG!Dcaoanpu4q+@NTc)_ zoD@)2^3in(-0_#ir2s`|yI?^eYWO*8@f)(mCAccL83#Y5UWXm?H%djjUu#pZzBUD1 zz*r%b_O;eVI_D(W+q30SI~!3n#0b<}062mbEIFKAtP>EY)e$oQoCWj%wXKzr`34E7 zqlYk3-x?Z0E>?lCHoX@o@o*6~k^>MJkp42bSzMHzJ@gvfi-hm0f>mx|M%8feG0xKn z-hwi!1X6OyX*Mw@ClDA=Pu~E7$p!86f_U#2qqs=?+9#)?L_R?L%Gi%Jz{B)i3ha@G z>HJ?`N68OSWl3_uNrfHDeh%Gl}r;jNBG ze${AZ`3*QgW%CIO$AQa-P6Bb(Qdv{}MZ2maj97Opw>=AY@>i8w4P(ogKqG6|bcV;S z5U^L!r~k4aSNC4_=KE@$5R#qR@XjAYjaPJLY)^Nq`E`PHsVhXZdiDjL`9%aXL*SS{#4ceihT)-x`vgn+$M?vgq)oyI zM$tXaD4piE*+0oHLsBajg6+<2wAT~3D{7Ma8;`0^>n$|tRtAEOMWxy%@ytEBZ&xoB zaCiU?;(a`7O*Um#y^}FrkDL)8`7i 
zjDrB$&2-^~ReZ3z5>A(`bCjAm!E-sz@e87(_e$qe#ZRfTd&nGZ-PI_DO(@|1#q{d+ zv^yll{iC1X>7Lm?9dd_Uk)n3B%3E#xu5)pJP{*Z*HEcmkw?!zcPa`*BK*jSp(ZH@3 z>~s=YOoiprEHyw~`^cR4;Ziliw93#V(8$&_lpFv@lR%LR3RJ|v~fjZcI_EU&O*IT-(qwDwe zO<86tQHcsBughXP@Fp`$`p+<%XnpOmB$F_Ma5`{C$i(CXu0!GRFvxQ`vV9#+=#SIw z4NTYJ{%E`*a`V%ReLA0Vf%DorK9FIefzic$SDj33Ey%OasZCz@p@4f&%ds6m!K%I8y9X!#_!)`TfnGY;NAa0HiYeP_Afn@?XWb{UN!%IqF z5~aa9h##~BsrnM0X4#8}r>HMUog~FXx7)24UnIDPHN;>!s3zVrJS_x52V6pW z*2E{;wiJD>(XX~MNCDVow^1E!)dxf+XYj^`BCbv#aN?|x1bc=s3P?0}!DFWeO(>~8 zUtEBtJo*ic9Rm#PvzeQSB!7elPUl07Yb}uAcu75hYU`TjlyD%tVDtcVxQu~EQk(pY zwo;er6(AJqCnCLLq$5Y40%n*mn=gh17POa-sgSK!H8koFQ;RI zdmKFT5wNjoN;UQ&dv4)Ibu`*}99_R~@ktB+KhU5$8k@AQ+so`xLJ9pl3nT(vfatgo z@2?cePuEGfG#vVt3-iK7FKm+;TE`>42ipTavZ0`+}Kk!xFjFTM%V0rnaE{*2juL3m# zLzd@m0=-`Bn8|xlpM|d}1JLo`^4ue(cCmJKgH-p=7?^CRK34dYGcsx8|2%Zjb_4N- zWsqguc$F5mc=P|}S!;3^@r`ln=bUZ0yPM^LQ2ufJ;ga8;oM6O=JZDv_iy+SRDhw#q zed)oY|IB?Eow3A}))UA#tid#$`tJ36`KKzDRP&U?u$0%AwA_=yfq}7ZZM5z7WzUH} z&Tp*<%zWodI_viSj?uXX8hsCxrY3-b4ym-IfNiA)Zg?*}a^GtLd`H)e3a68kI;mv~X9@$%J^LJOXgqTRUmaFuHOHJ97cAhYJEn>BaKS!=B}eHSQ6#p41R5zo!q0OY{6Wh9g{}-Qm_1bBm+xO)3wcw=bwzD zHUdAfy(;YS>6yRi7_BhXd_yKP`K@m`Lt&6&Q!xIXW9rS8x_FPmdA{k(jnU6R7CYh- zHR+0@5Znzy45T%Fhy5LsI3MuvZZ+Yi3u~)2(ekin_bv3b_RK+yA7ovcpl?q8opVbo z5N%Li77?~s4}1w^89gcJQ}4uK5L>UR>fU&kIlUI%A^G%z&8P28bL1Epsufrxw?{Om z+PpH*N3?pGYktcE_t$@* z7}t)XuE(%I!0*u8TcJNyEFiF-QS0&x7P9Bkh;1E$fSMWGTRDAu8?yeJ zsHT@kkPADx+F*vLWaxzwbRdL)W06)0T`@LZO==mu$RMur=-Vbn=g8KT^H$Hq%8I8F8OBmh-PkWvR>}ryeBq}*=b=l$JNiB^!Ez0^mw`zKWjh{UzL`1h!`gwv3xe}X$bMC=Gx~@~ zxVXWTTVAL2OpW-%fV?UvrhlDa-7ky54@Vs#UeJieTGAGJ(Fy6<_dXdV{^=qCT((b6@8IQ145X`Wsh zpO))t)`q-QVw_&51zu)b-S8)-u`|f0Hl< zBrhIq=7B*je)?kx%P!GywFTlwO9U6X2$dO_i`b}hl`>%YCvTuCYXId_+*fIssE24? zPk2K#rHKy9*jDA2cR1!Ho|h)?%56nR{=Upmw$Y45>Kn-mzBC@5ISq#LOwOfD3Boi+(}w^f#5{ zw%eY3XtOHYG61rGDl&tWDKu&Ay~y0f@!hL zefg`?xHO}5uMm3WkyZWld(z^nHpW-03~>mf0lRE582D`-{ZFs)N~)n%!gvrf6j$fZ zh9ab*v9b6_7ffdYvhNO%%#Kz2aOxZJm_sAVEA@QyUkcsI<P# zk{zcg@$zjp*%;W~W2;mqL6P8d-|em#_n>{GXML6^9g(s52;TN>2`X#eNL-e4YHbx+ zTWrm0%?!YyQIh<-ri>kn1j-K5v`4NY<})C_F`~(#*XoEruz~6YU29T`UsZldhlfTF z$#KZX+b8L{g8VI=@s^I7mlp|P=^smSzF`}9NHY(J_t4OMvDX&&AwqqZvxxksZLf_L zy9h?%r+>3HFP+3aa~umLf6lQXrjb1`Tq|vg6_^i|97Cz#dI&$TML&@lwdh?}y+i(j zYCNq<~*m* zZSf3NX`gM*Wo4bGfulElf_^>ETsMk)Nk^OEzwE;OP`wgmFg1Rt@!pe^ z76nQ9m{HHgU*jHnVO46DP)QEF>87&$J(6eIoMeNYlA_SaT{&E;RdEXsx4beh`$6V_ zxIffyMbIm$E9hC0GJlf&vuJ9jFCvz(w>_;)EDi1>fpQ>8Di&T=GeRw|VvM7Z8JQrL)WL008!F*d0fh4TW!NNj+$M7TNfW}|>^TbKTsdax45K)n(P?y%f; z_2cIkwS%n@(vRV|K^%fhS{1Rn3|nxrP#kz&}Yc+YD?xBXM z)H|8F&P7z33k2Ls{>Ox5bA@f`45Mvn9%it-cIwqsN#>3eaM|y*tCAcKm#!|@@O(>a zoumY43QsR5Oq9t{NPOw>nH2{-&F}GHrmb`ED|vYf?P!Brp@23+`fm<*Lks5Jby%|6 zffUy-40}gmox5nBz;#})pe$D?vt6eHsZH}2idLr)ZuUY$v-K@G`Idw`Z zXJuCF3=9PNoX2y0roJ5DA=QE5@4j>)=ay`gX-}!K$^5LC(w(6~jZi6v7^Hr?WJ-xSVv&Rgz1lvyJo?_2i>isOW-tGwZyLH#+ zaMMBgdckuv9p8?`CSuGIrNZ*H-c@by`rb!=#;}~Fa0ZQKO8udej7)D|eQQ?vH~~*) zk|OzOt?6k$T`;Q#QZAyN%N;6Lln6`=I zbiOagZzJ2HnvI`Mo2KsI4t+P{a+pX2c5&`BVtO zX}X_Q8}obx^Z!(|9*B=^c9QJFZ{m*i6}mQm@yKuxzy9w>Yx0J;Fb0oZX~wMDi*I|m z>e1^u`_|9}8WnX7Ay`TX8z30dJ75p}sc+ck=gdRh&y#f_q9Ju!bcViqvuI~T|1qSm_c#; zW)GGxdiLnJmMTTB6MQ5(I&-8-F)Mz0M`)o&4M?D0jR_C%ACp`zb}XbL;BtoM!WQ=-2+q zMm$48%KOe<^>aH(WsERseLXE5;UUjRAccamv^SHz_ebPd5v~sUmEq$vx9-PUistKy z5V-QH<+Ko6Y(n{6Za?OJsD^;fvjDl0KO;c~`JskJB4PbmI-C+%VyTR?0G?2t+E+(KFH!~unV!HW$-X z)%_QhvB?DDX`n6Na)z7(CV!uIM#E{&EC%g*~G^q5{OnkX+Ru#m|= zW@n|O@Cpg<)f$4oDQJc*ew=Vc9(?C7ZJn8)>=F2WKvtm5|PL}trM{rs+>WbM}<#;aa1FX7`XuyVj!8V}H;Ed)fHKKOcA zBsESNL*~ob-VMX{<-ufW74l!L-aJk zs1qHitxLO(mS*mM+qw4!n9QZff9-%z@Eb1_Hd<7e`cv!oob7$Tn0?rS6J%FsePTTD 
z)!hOxUlAeI^3^;Cld%NifC6&zIhGO9f)j>Ok+mv!}txU8JfTZt-^R#FrRQi(%D126BLn3 z*E?OblL~i!r+eQ!U0J|9ZcK)5$$z?Xe#D=l#kqmAzeAl^5`YQPv=A2JYZHY3wXW6k z%2MecCf?*RH}A57#_~WRen$_b>Z%j@YFNeL$4+1_;p67tq~?xcqTDf??6hY>1lJ?- z+YIsWS9UMBC6K_xj^5LBp9vT8#8x`}ncK7CHop=9^F?-Akw*&k+q3y6NYS@a$z2cR zrRiwVE>%fIkqx?9GzP*-jJv!>Lna*DrwsV&V4-$b+FllBKV9c%7Afr4^xjmngJ_fO z&l=<~<+{mlF@z>}7ABJ7-IPQ{>2{62a|+G7)cyVFOycq~QunE(UrhQYKvT87eE8Pj zG?gz^SHTiIQ^0TkJyOj8e&Et_M9dQxxYfiMsf6C{uAbJ*i^2QqGs4$1b=JG|G}Yhc%SsiGD~7aqzD5Qt;bpmBPf)$v z{VC?zl(f-;zNuncu}?|FQ2D!id`;aa!x;rDYAmQQScvp>j2hKi$n{W@`QzzWRJSxW zeC_}KS^DP*<*H<(gq8!L@rTFl{v)s@1*nnS&Z5Z3Vc|;c$cshxvoczV z`;?uH&F>|D+Q;C0^8_;WdPFpOb3|8XZg1$z1O}h> zrVQ2|4Tg;f;L61g;o*)gIn)$1ppHj)9mQHc{D^ASyIuzm)c)|N^sU~FF{6MPQ8HYl zTOqrRRB|*~)T5e%MV@}&44br;xY40Ua7vgt@eNj5E`{LaJotRHaEnG&EpZ~Z4eUxL z59fW%{DPS32$>#EIu7@&2Pm%hj6tZ0b8YK9=X}-OD(6){WW_vj%kvM#!$hlzLQ?K> z6GU}babE|0L)XDhQ(5BV7=x@|c3>xf{JJ+qv` z=W@H7Y>72glr-@!3)5mb0{fS6i=@ zW2wO)D5K;VBZ=j9%x1`J1zS;`Qha?CH{QgM2g|j7O0UGCZ5&5|mA7NjePN>Iv3GTj zP62a@PC)xbnmM(HSY&PjIzvNQGFmNM5}6|S)!JLKyR_@_<-j8UrelX>Z@Jw|g&Vm- zem)}2&by=FHtxtT%{7S$6?BZ344!n1!zM@Lca9~?JHysR&Cu&~Tzr`Hv5xaO&6|7- zY12H9R;{QEurs)zRDpA4ngEW>B3nQZm`ug+v7XN3)8#WKih^7duGgk#;$CRYsMlLB zL}XMjwrSa;CdwTTC@fM`zwEh8K~^{&9Qar_8Jn~tH8mc%Q1Eu4EPxCy!BVTknjF>C zMHE7FfE=dn1+rbOVPGWUd#;49gX_74#zJnTyw1BEUoHWR&bXtGW1lGghM^>A2(1B_IiB93J>;c%~v7jfc8q`-r9We${Z zI+Fx83>J)>5lA=GVrw>(!V=-o4lDj1qH>Gh`1dFV? zjN0%Um+;_MouRzusMJE1dFbZ0vhw=tMq9pynve5TZ0q{qLV(tAUc7TwJockfj=>gt zn>3$6U!Aw{C032)Q<{M%CG4aOdM`r`ALE2eQIOZNY>+?{cib9~)~|SB*JQDY2&1P* z6fmj{vuGa);H!3+UE$!kse!l%35I{)?2l-lp`|fxH%g~4Y@Ph)WQ=L*)c7Tp^`e(q zho3c-Nm@yJS;T}E(EWde&9=^Fsb)nsk_>f_)%j^Gg?g^%FAI5=XASQD+R(o z@wIG0j34^|2^5#=zOi_xv$;Nqo4G?!z@+P61I6Dr%3*P_o$RgN;qPPn<+U1@?uTir zId9(9Kj0gXe52d`Wf(~7x?8K~<;k8xSTh(_X$;S9kOu^D%O%B0(&Mb6NvgbAjZyro zxND(@w=8;u8;d$k;mG&#FL594h)4H4kL3YU{w}xpRGZTe0fkPy@~*ww*ZSEqD}yyo z;kCb-t=ad*LoWLO{Q&+fs!0oR$2ty1rG<}AxG^zPH;B7-lqKCyr>Yw|nM@pHHnV($ zlrE^oWLChfsr`Jctf4Z#O;{0J?Rib(jcA1RXCKAR`8_3C#$or0Y`cWj%Y>2K@3+>c z(CHF{wcY2R(8{4a1hs5CqQS3p>y#Q~N!F93ebXShBfCrpJ6N8~47Z+I7AZNq4zZ z$~EM7{vrDA_9ZFg_|`hhY%tyiZJ4~(NODcO~e=%pVEyI0x2{7)f^iq(& zBGyjVpSuf(EH-Lp`k^zrp7F~Cy0J-cnp6JE*w(dq%niX6Z!56+j9yME6P6;$Z0zD? 
zY<9^)O%^OP|C=g1*1r}trc=;wIXcUhsETTC2)_+i9DjLC#MEeHo`z_h`5SUi_D%~6 z)$3@<2yU$hM1LBzLq&z+s@DKI&q)!m1)1y!~n}(Qf{mCvRsIi2zKoF1Vpj3;?`}vFaf!aY#jYWpOo>u zykhSl8zRdt*T+f&%C@x`^MD`*fl{73**F>OejT!K<7QJnT{3tNyUu}B6jZv*J=qyz6Z}8n|w@z^bi_o%e zN~%lZ6=U3*YW!2LaLdq+FNY zF#b-D*6}#R(DM_Wvg+y*n}7gz+5P@ew2V;@wN`*UEeQVyU))k(`?AU4JiKAKALeln zXl4T|6SrILZ+tI3=kH4uG6IwX zZ1q@hSq7)L?eZtXgIs3&n=I_K2 zPZ2$5&=f_Mz!eh8u!ck6nr6OFmjO%iOcz9PbbMiFlH4i5>na?p@h*IeG*77dBx07< zaL{FK^UUf{N^19s;CbY6@gb*{N-VmO0a@l&x%2RWT+pz9uXQmh4KX@i(OJQ|xce*D zS+frsY1)&Y9Gk=Jz{`x(u${Y-^rPRbQ}g=qV{!jcOx!%jho!2sv)f)8RQ}cX>Fdxp zZ@<1$ApO4Ff6?y@$RLzofGxemTZOolYaUo^@!O{|vHLtRd@dtBBojMh z+vC#(NonXc*L!+Q_%ObDPW~jfM^A44F2P*L`6b%iWb){ubT)>cE}6B2^QoVUy24DH zz)nHks)Ky41f@v%!>)&3dNuR4sVw_1!e#ILl zFo!~1?4JC*v)_+1#rnZH?>d1>{Bn8_6vTkKev-Ueu{=mV`w&23uTdnAN4 zg!S*|_rPC=iP`$)5N%kvX2-?o`8Ndd@-dT_5~C7tD}(C1 zaAt@AU+>npENkP%=Wu;X?lBA7!A|l^jbB&K|jIX{+R>ALEny-lTh-_qryLFNh zDG3%w>o`h4?IK4$_RvMqjX}C7Gb=jXV05)?ts`rt0}wJ@YYseoFwGiw%1JA&PZ;#1j+J1aeBxk=+0`972AoAR z`)xoLETV|#b?-2Be9HC960!*Bvg*C(K~`3n=z5QzvJUd1K+e@`o71_&)&GS2QOpBE zKl%tYt{+S5`^*-*mDZ;;-VX37g>{8L%M!sd7lL^4IJ${1>+ceuAT~2%V6y4?FT!#- z2=!~S3Dzm1f8*ZPO+0dIc^1>wldk3ZD;cvHttaE2nDoOhw9T=+Y_Won-_6f2u1;@N zY4Y8>YNM9mdn6ljQW*()EiZgm%!}WAuS|@}@oky3(c$>9{`xgpeb7~JKzi%4DK&($ z?ZG66XQGDIn|0o2{yZ!K}26(#}poi|aJHivsG7=pvw0FX6v3v>~CaXEe!r3|XaIz4%lqH92 z+**-Q9W>g~Q=2EYGm!XZTk@ydZ}_*Z7BUFp_)l6%i2&X9q0- zM^}9vLylXFA1;kk>U}1&BqZsX7D5-6Z;aIX)dy{B3WJ@FgL^j!NTW?j_+TuRYZ(e0 zo$QnRr`J=465Jk%;5Vy`ZKIu=u#Vnq&G!$564iv=9`L-KVRfO`)xH)chd<9G)JS8U z%;wx-lfi_U3~uWmTMCX5DZq~xlJJ%UG!2uk~53cZPNCG#kSuE z)LP&kxI*6c%Q=MV6v_GUD~;E0)c~4k<;!~PSr_zb-?H1Krcv(>gC(WUwqlL^DfPVj zZWcLr)o+!^#&orD(S+XPfgaf_>yRtb;)@eSx0}4xT_fo`?!kT@)cHzlOl>XW1dnqP zWCko|VsvevFyAf!q)pJ;X;_bFfeM|6mFGUPReq=IPuhcWjXW93ZxIe_2+v~Hm?-VA z7(kRI|7&fC7}@FKlYIXVb=ct_tehDNV%f|To$WnQKixBjuBmjR92m0P-w1~nm(^9v zNVb3Z7)(b?1l2&~l#;{_*XHiuyHAA^`Q_)ekK8r}Hl)9NkiRm?X33KQ$^|nL&*|IK z&F?u%D7-H$WN(gf5#EysGAuf%+hta~aqOpEae1o=u@D)3Tl#b#gKGpqE5sA}o<)ow1s^Eq#UyNQr(x@Yc%2t|j?We_C zJL#q1rGvL$C&$JP-rDwxXlAzHEy4z}y{_>_c-7sfa~D6qdngJD71w!ruTs5K<9@;- zV-a7PSPW9|3#8$vDD76D`!(rkqE6Cb&r5G+3&Er};sSea#ruuHk_fEs8O5U3(rTktjMT$DfIe3m6*O-MQ1!H+!#&-sv!eWc>$% z57eDuDGqSd=|XA|mXXhv0!C;naG?mekr%$wt&-#~Q?Z`z0qnRLVF;jXdy&P_5^8Wk z{EHy4TY~^1yRYlXND(M}u>lC?S~Y)J>#}qL6V(`!GBsXO8$c|^&sp25qc`}KMQP8$ z;3wa|1B2MzS~rGx)qz&Fa#09*VKC zvkQ1C>mEFUN4t?B=B}9$cXlojsIYDlslr4;gbv1GgW6iTDtE^VKH#necJ&$i{2%1A z18smNg0SHP3AA>8xN$R%ao@8QKHUlD%a#p$GR2qn|F# zuD;(FI932CgWaQY+Qmth!EaPh17AqaHZtAZEt~=w?S!^xOJ5hObA2yHtY~|vwkcAb zzPNnI@=UU$AJEgoFWC{FQ_EnrXZc(`gzk{PZ+AIWv$(Am42|r76Dz>YhOt*a1-l(hwLX5X9@hDtR5Z)hlQay#|tKNnhue)BFc+&98%2P6A{Q;eCWQe@|Eh#>w zXS-)5d|FXJ>&;YWCTayPl=`4&lS5>Xlr-iA2xx|#WglqAQ2J9THQdi|QJ~vbl@;a` z<&K?945rEDY;+OX)w*AM0t0c8sIP_smDj$!Ia!W7_PSOf0dr0T^XH~o==uqO#i{48 z+ZGS`F+zs4_pK~&7qr$uJt8M}isW&2mFX1;Y`VD5C_UA!w|OHC zk13Bd+d9}Y3wm?K&F^p9&!$_`$M~%}AN`2s#ruMaEA%YWa_+!u_IBoW2c#SD*LO$_DtN@AwoOQaKd{)M!} z{Z)y|4=kQDaLw|?rEEWlW%z|ps+bwI205y@KXLGOR*BVwvgEV&JaCE?Z@M$rD*MP= z;$r55mHLqIOIYp3(&G1I$hyjz8o+urkXr@^1j@%64w%#bvn z)oZ5nu|JE(Cae7^DpWtR05Pb{K?UdK+rWtV14714QGmBdx=}|W(Vyo;v0;Ik(ae>( z_gh+3Df8<`F0n+!k4EvK2k*dig2j80s>@+6uCrO_bw)Xv=JW)`{g@dpF(v;qA;AFE zcB67&pLAZ#@+5{zC(7zi(+dZ_0vZ{lu@7&YqGl*K{T)dPD1xFzotIP3gpr*F-e~UyU;#1_F>RHV|xrL zuZ@|&+f4W9#XYUM0Vd(aVi-0`UuXY%ra3Re%3pq!rxl0JCFrD%w*3*zVRD6lHzx&D zMm)W&G@}K=_&9}Ex(eWfLd_7pTfQ8bT$3U5z#ocJzJye9Xu2MQwG4|Z>1{ncA-7(f z6pU2QINjgygIF}(R7-+cqOcR{lgNX| zz{s=jt!*Nr7eK{>bkOj$uT4-HmYYjm-5zeZO9HTdF?z30ST$R!MP5s*h%!3z&X~~Cwcccrq+Tg#nXsyM=ED>InULW|vS$ankA0UYvq`tB 
z(nn%!!x-VOqh-nGyi5?P9@H|K8;bZ~<~}i-(kr!eI%;aCYEz6B?fuJ2=2`UQKahd% zj->5KgHsv{!7e-&WVHscE6&C~DY}8S=kdjmcB~7bATT(~P43C2$iJ=)E1l|o=VdWf zf*;aWc}SX{n6`KAliDPLhl&fZ&%%;@Y zUKNfY+E$PK5ryl-Z1B@svjVL?##dI!S-CF5US6uvEt_{wlCr*B4!eEkd*1tQz|xZ= zp@KE3+l%jYRZmE&doWF~=sdtf=29h@TBwSN38hY1V-i4()YTFyc~Vw~`=~mGn~%kK zY)_n&`g|#gEu6nUA+ZEGjh-E|c;PBgbB80i=x*XmstmFl=%$9|P7rO%NUYl^w<3#M zp%N>Il7Z}Eqwywb-#DGX@u+nO_4l*IZRA<$-8dzhb7ezX8g!?`sT6;lfiddzZz1PK1!Az4 zJr0eTkTy!qe6iI9=&co4XCTBpP$#6$6}VwlLib@q*3c|+dCsm|U~GoI-V8A=u&>j* zuD1urC}K1ST~TNKh}~@5tNafuXMph-g^RR(1xB;ixD=5Xd&G?CQ8vl*QH&4p?wd}> z)4zbNF*p7P$}*h}Rs%_@Nvg?nuAXFr!vLN7PIZQ{JF$wm73k&6^#m(y3$VrJe=oT? z@RB1k`QcY*s4W|UmxecEnC15TcMH8VliIU2_eyh&2?#B{MgifG?{!J1U1!c4-1Ihs z@qMobP#(Y&c>;C$0Ci}&K%>%HP@QmPzkww( zHN(=ny^JJ;9nEmba#oN=bQ%^e$?ra<{rUrbXV81x(HzjL05vtM#1ScOXVnUrF+n6; zxE_m==`&=(C2iR=_6fr>mfX|5UsWl)-t1MbW&Iy#Zy6QkAHDq!3@y?q;J^TbQU)zG z3?U34-3X!*(lH<%Lx_Yl5+V{3Lx*$;NDQEaq;xZMcZ}!${hzbWS?jzy&oeA|!z^I0 zn|t5iz4x_0S9<(~=Y|>}s>7A*h+h}J!kHVR>Mv2i1g-I11Q>Ea(_QVGHI87Qe#IA7z91z{&1CU|8(07M+$@!OPdDaK2-|kh+As zb!Os46E=X;r9~6dp!1eIm^T~xcsqV?;;3M?hFHbQKw|JWQZ#!t>blc$od6Phbezvf z(HZ>wDU925RUgGnoznqJ;SYw_i_#7omZy?+dqWJCI({gU+RUJS0slF`Y3rWf3O_Rg z3j3zn#-3z5Y75%Gkdw1d*GTW{=U_OChR+yzXt^mPa}MnR~Zoye@qhw;Czwwi2A zhJH|AER4^6#?06r2!AcRFCl(Ug>?{aXEt@)$z#f){B8)0fMmk@W!J@;(oU>h&!?44 zA3w=LNe4--Xwu2Ut`7S8l`C|O8Y;t{7ancy{ZoKzF`QD8UK5poQ2p3rl^86}$|JSR z5j2%k1<{W)j;2nf^3opCuN;Y@2%a^4_Po|JBP8v!Ch0YwKQh{Yh6*PrG4vp zdbg%_HCk2i`eG^3mynA!RGd;5iFvi5!3+#$!hH@-FtKiG(7_hCs~})dm!QhKT$lF@ zpzjApX48cX9u~L7ewRm2;qwb38!FRi3^H&k|(eJClg4U<6oVn z02+5Z!sO<^X){+db~+K?kdA*BoP70h{SsY_0QQtYUR!&btcX%2YLR^#LOD(7izuj% z*On~(rfCNt`zi+@uDK^$Q;X=83LS|W zF2dVyyi0Fe+*raUTfb_>x1vDe?Yh%*i7dRm%x~!w@fDP>_nCa42YE=k2TDkDvK6R; zrqy4hnuqAU!rN;~4FOTDbI@fip=F>=_vzJMo#-7OdL3uOiqKChci*J-jk%LlM~Y{~ z{%%ZPVpo#Ix?Q3&+|+~e`K5IK3?HZs#ifA!cfV9_;YICuO@mro{u0G1M5TR?PDiVP zmZn{MdIc=MQZYdXuhX;{x-@$3_w-8XCB~vVwqEMvQ|{!k;P{1Zu-bz|G0RicDIHUt z2HKyKd!bHXk{J_bC$Hnd|9m4ocU>~y);m0nQ_WD4I-0(|9F4%tiy6NjGz(Z~B25Sg zxiI`@d@i|WlN=f<_h6Ep9TTB;9z`$vVS9gTyo;FNP2ft?L-io@MD59pY zG2N}4uV1a}!*S1N*!9_@J+tP|pSB*wX5x_SZfwdTUyPpbHYOM~N&5#f_u?+JsS@-1 zDMEx*-6mvkS-WJr@U2O@fj&;@CMhc{iVpB)Ep~+ROdl_0@|1b4@9$Lg{CFAK+2hvi z{n%|QM*LK!e#ek=zjKw*eE+(?!Q1u?JN;IS;1Ix_9Q$43o7p+?oJB$&mxf)qQ6{oE z4vqd9_gp#f%3bE>hHoW+cKo?@?{_V4j#VP-Is&hh<4`gO$6O(ij)i5rgc z%W43TY$_Vn${$)OV`rz}nn(E@5YV&>K^qZOx_0|u_BB77+gM@3!_X8^*+iG5R8TsL$1TkL-vRF~LrKP{QN5ii$(H1I?=*HZQv*MuVjUNik z!qwXSbpLNBx@bLu2kr3OKDKUSf+lDjSg7;L#EAM#=vFo_-Kzf@yN|OEK&GYBk(&DR z)eL;L^2Yxde~Os91=wNv+|t2|Wq66KE>(U=6m7lsvNNP17?wbIWtPI7YQ8w1oNCQx z;iasn=FfWj^_wWxhD6zyH;nergS%3>)=Z2g8SCZiuay+-6*yIcmfYhJ&C&J1j=0bj zMI&OlUo$szyzq%#E#`*E7jeQc^f6-KD`IHWHaaKcTW;7FW^5&1pTX}Nx)G6N%=Il7 z$Xf20$k)=)fl^~Mc6g~MWY56i==4(-k+hxAr4?_-#|hjykR9!Tb$unL7M;xDgMXm8 zS1UY(&-c#p9}zPmbEG0$d*|0-H{A^FTzayd@WdyN(xl`s+=6Ill49 znaRbt9{@JV0v6(cB2dl7oMuzIpPCPq`oA_%?C14mdpwjR7eUI&(1<}odRO~?(rAPJn zqtalc&ts3}H9?+FK&9)o!_C@TI;ulr6g6^pjd^P>k2&j>n8M9tcDUxD=+s=)3&o&o zZUBlGe`Egzv%=b_A{m5-&-t_w1L!vSTj)ddP)NRT1FBv?Y(Cw77}C zF5(4w$7?E#r-u1|llK1y8jhR6PJ6EEJfvOD$~(+~I6{*^vQCCqr7EYvK1yiGsv4v$ zPY>(C`4F+QnC|eCHJe!%bRhN5l4R`}9q*3-j>K@<1szV^m(rVl68B$&G_#*!gWMzkLdyI==2IhVIqvzX7uI<^u4k^gg9CVyUTDWILX#ekvIbuB zv0^6xyW^$62gn(tn2<;%Cv`=Fn+eu+kE>CFUOVSj)20EJ?7N3r=u&;RE}7M{&AL56 zAJK<5MXN>(lAe?S?(yT;%+wrGiiMC~EkF#^vN(M|QztyNM;YzYu`vX&5x!?diG&p5 zq%R|Pq($vGN7xoqs9oGLPm$o0e*0Us&thQ*_lqaJQglA3XLofVehq;AI*vKNYX~Uc z5WBO%!(*AuXatPSok-Au=ukH)6;kPt3DzRB)kpCXpBFMD^!$1hKvcp2H$b#ls-xOD z;vfNnHcRT&&_L>lQLn1kHk;TQC`vV~P%RZIil}eZkEk;pCU7i!g&f|N===3y3 
zmeQ&8ScDq2$KYBvscrcR$y`LexOFG47nCq=dUFnm9j*?u^Ev4kQTcr8=rQ_s>hfL< zwEbPGIRbd#YbNSUL^r$C#;#CHsHG6fZq&K-Ft-l~Z@nlIjG0$8!A|B$t@vR)F?m`n zr`OY_0G*>zgmRMmtczPcB&+z0qkuknC|jey%Fyv3vLsEB^KFXUzbA=z4dgt~hb zb}@D<_QVA}f!I=sX58!)P;j^RzTK(vQ1cc^^n{BOwOu*3#KJJM#gX~z%g>)*^oky^B~9?X?bvsSHm)ZuH?3X;I!Bl9nGOdAy!;X+}1 zJUhF*eQkkA_TlbT4QjR@qY22BuN%KD58=F&HUFI{ZO;MX!pjT{&<$OO#~zmnYe<#D zBmmRL;_A_`T|)qg2K2h-6}5M?>G27zs_~7ZfV?Re&(NuX1ZB7UU)NZF)RNgad`8F&74!~3Q1 zy9~@Z52Q(ZGdyD4I21<4N6s}BS~Zf{7bw1x=vPOF_OV8sALRL&7aP}I3ekDP5SgOF zz*S+tU~-rFqeylv$9`HLbPQj;&^`T!9QvBhZpN8 z)1y|w>PLM_`-Id1Qill1l0SS$b!->KP3J3-FPld@U}+|B4hQZKv8=}o_G`n< z=uuOlOLgMXHGcl;q*ay2zRPQEctUzontH;xph8PJ=6%Ee`}B2u95S2HeCl*}nvgf0 zdY3-wIvP$~xG6Bx!SS=F;aHj_;W{W+gM7LjHxt7j6w>(^Wb(zOLppT;p@0`HcP^To z0jS5+)f#~4vVT0%>Equ1bCzuS>bqa-VC;A;tiFem z%nIJVrI%VFYN_$L9cbX1mrg_Hv(E~S==XoA#_?5nr58dD)Se~&_iQ$6eTNo)_8!2y zJVL^RUkM99x~WNH@^e%0GGA`S)&jX(2d4n}{DMByI`7%}(auePT@>3W0S|_X^jw!% z+D8lA1$P@y<=vW?3w$sduv7mJ+89S@fLzFMqmb8nwRQ7f@AAv#F7%Qni?Ri{ zDRYqku}V~ac~@5I!rk}3p~$CyAh*l39M=FN^j2!2g<`;o6QDB4F+)$lZtT9A)QDHL zAdUwTW~_S1j-XT1mt$m>JVxgK?i)Zx_T)veF6 zNbfA=Jn9$BoXZiS93pBek}z43SxdDn$V=P}nGI!sve)TFyBx2TeSr+%?W$icMX`#BNk1hB zZgYBB&OX<{DjXG%S}GwP0LXRT)Ovt}g072<1NO^Ct0goA&Xd15S8qu4Moh%MCB2-W zooDLuq3%rm-stvDYQ?N|H*L`X0;wfGU2530h1vLtadad7E(tVfA+CVdt*xzx0+GZ^ zM;nSA|F&IBQ(`BKUE|Gze}_xzZ3LYS>Om{SDWzS5qS&#L2QLSR(ALw}-pH@65xF15J$g zVySSMPWMv%%H+4Vjx3KsmSY9Sf8RLBywH+ag2bP|iS52{{B48UD(^=w!D3U;W;c8` zWq*{9gfKJajUqv&4!3FAxPQe6J_x|H7$p4U4`$4Q#s23ZD(AdzcgAfVIS@S*vJqM5 z=nMZm>x|+87@E)&Rm_8^paW;I?rqGfbI(Zu+q72og((TR0A`!#)b3VZci_0fbYvhg zGTX%fb~y^#bL4Q@(x!_fZUCJr^Tn@hY^Rs$&aQ7cEH$TvDB}DjkteT z1xufx49>a@%mgI;ZaO}lc>ngi@7+E3p3RPfviQtY;P{PcXlzW$Du?r0=trqV{6|m7kM;&Y`!!rD(rf!|7pHsFjjUhGD9hdNShwv(XersAf6!o76r|wY z8r3uZOJxxk>-6o@!`^{0G^z_^G%9Hlduh`;{)PM6yAImjdP$Q%r~3U-B{0M^$_g2E z$0(zRdj3lV#~iZEl0Yw1RTrT50OoT=PvIOum(;v$6L-Voi@Z_SYWxREMx6B83VtJ0 zvYV~D_Id?971X=sX87m}{(cJfCjN#`>sL>JeYrjA#{MBH@((HD)f6#a>3a~al#*6J z8udcVq|9XdyjP}mpo-vfJU?t8w-o@lY0Hz?p1VsoZ6r!eZ6nz2K|2=quui=m$6~p1&_SDF8;e4F7 zmeP??X&@cjTe?PP`D=lY7jZQLncZpKfebAIEuyg^KV57HR{KR-TSU?5UdCK=?d+bq#d*OJ5|wS8`Tr+6N6?_@8S3afhqGG^<~<6|=j=4QL3r zb;IqfMXk-h)d(i;E~Ts6VHaJq$laN#%X)(UfrRXSp1bx2T$w7Pr&6H582cRGnTCBV zFJKgzEoy|kd@^!57STv;wB`5+r%WS z6K{DE7Bd{Uos&Gs+|I~uV17%9@D!Hzs35!}xt9t0LKboY4B*EKc3vPUvJ%%x?6wV1 z-CdSanT^j}(ezWL&(#0sG3F+U^<%i>iX=+OO3b5!E36Z*;tr9c$Bfo}{C6g7vUCt* z{1%8kw(5_+l|%)}E_xy?anq3=jNQMk0>c*`I>@XbnQ={?g^=$(wd+Lub#9K`w$tW% z`iGM-0ea{pxziap?Mhk|m=tMaVpdpDtSo>tMCLm4y0AQBhtTzR!(%6gQxBR4SAX=( zoEXeBoDsR$i9HLI{o4H92g`gBd(}DoF*S0>+S2S7_Z|&!Fnvu~NJXa@pN{^;Io&~R z=A#7*2NhUG|8--q4v=YB!sveL%hzoPIs>}r1o$Kt{sV{(KjvOMnE?}kJ%|JqiZ*kv zCic9A5rJ1L@aBb@xh8SHOU#MTbjyf+j(8IxB}Xpi`IR`CcNAo&FoxB;9Pw5BN^Laj zeM<25BE}f26Dyw%|sz`}$ z4?X3U*v3we*V8$@aZq9W>X-(9Nd56<|C3Q;1#Bd35jg8S zMYKQZL7O37%%~g^P?Eu^%ZHl!F0)kKdox#v-=rP}5___3C8w7CE4ekG22{jlNVCN6 zs;5^PUe|ovU_3WnJiuyR<=)qW4i}!=?<%8i{()|i%cicUaxwTc;FPd{$kSmThTGlK zJp%xzqfy=uzSUg4zXFo*YZY$ot7AVbDddN0+2&66=O*CRO1ZEl$o{JxR z@z&Lb;9~jGfGW>Zpeg>2d_gnnxU{3E%v#Ib*%~0Rjk~jj%Gtdfc{qN_X^vseMGBWMD^Bi8mwsE#-}_ufRRh}Ah!_RMlO9pfKX zpB{c;0EdrJEch&aU-~h}$i>eyIP=Nv9J(*+LB4EAd|yxi&m7Ly#R$u;OrmZf{ z@4(o5=hR!nGf-CP_|jKPC}lE_tJ>i{TjAiEy--0;^k<(X?I}Inq0>E z^e1$iI>8&9LlJp+Y2SWdLGBG|V^7VBt`RGYOc40tKm>KNbDi`3z{@z8tx$;_(%G}% z4K0Hc7#rHhc6y%Wa|$~NRd@q@aiG886guiJ+;hFzGLQ;cTqFf&-v0+W1=vk2&EtDC zuKE^(x7p+9n-Fg@A=T!Rw+1Ba*VBodCgQ}fac_PuN=8e*Zy}}Yx;2z zA9vsvrb*=`I^9d0IXW(_tqV1u_l0;nHx_qfmWh`bm$364E}AR$KFMs9^4C1d`}Nna zYgXoH`|Mut&GhZ>78!6fco6q;wxebzSI2rS!ehaQVDONnIej*Ko@O=YIF&Nf>}zq= 
zesX=C%NK8|NzR{fn)>)V*OJhm3Ff8K)H%urw&LxavX38HWVDc#Qrd$|=>8b79~D88KIATdS>R;FKvb4ax3u$d*kJuzwELWLUViioHyb7R{Wa8}j8X=WzjCo81 zny#QGZpR_v{jtMo>9)o6ewh-!ty3+YGJ6xwx0_VTikgv^ z@g5bw6oW;>D)p7VtLBOEPSi<}4itb%g_=tLM9h{|!N6U#7a6RhcN^-=5me|ruSL?z zg`+LA$r9w6t)UI_Hc=peH-6CU&{jh7lR&0ri>2)MFcDbFYiBs|Z@`0G9>X$%%RklW ze)Y=h(GcR|;);<@_MSey=L@(Juh$3lUOzy&#;7h@YoDYlALVrIxDyq;uY zg%?hH{fz#x_I`@~VY}~YTzhUr6JU9HQhYGL+#zC;y5JnR^X3E#{c81o$}NUe)iqsg z^}c4oQl_};7a6-5r30*FJ>1|`d<W24sab)rHz*#H{i1rdD`bS4pVuK(b(nALC<4dcq|tT$F4*K|)<3l~|qI0Yel*H3Dc=8rZagRG+R zt;0Jz4<3q}-XIlpLyum0Dep3Q%3CBJux`HR$?)RjQ&4%lo%l!3->c{<=PlJCDcsGO znaIa2b3;|Z@>?)Q8rB4Z4n}^LjU@V%*ApAq)+48zsj)owf#bvEnwA2yfMD$^Ma8v0 zAG4ztrRsw^=RNus%_ZKed0XE%{M~|W@pVtikSB#p^W4uPM(hq*T@XUNK^cw@Xp>_d ziU<3dXR#v%`5lg2E2-U}tKV3Vznfv?9c_tFE=mnHNDltx#xsOXxT=%!l9@-tx_?hL z0fdh+^2OsbsE^08lfUqAdLg}_o^4)$YmX#e=F`<%$139ZFM@GRdPPqvfIPrqYd+lE z)8IepO6u%)r3E(DC+y-*qE4Aiue_Em1oQVjGlWW~CsIP6;#8REOt^|&$G4(OvMf;} z&VB9@KOQ}p2WI>XbFP}=Y;J+nvHNTK1gpTRJYD7-d#?sZpOgwc zPmJaK&7k9=3>8i*uvEOTu<~5j)Hg-1BV#e+K}MFp1=nU0JdbfHm%ausm*G7WxL>la z1*CoOn|`z82i_m6k}W2rSH=wVbx;XNJN080A+up6ZZQ95^YZ?dW@V$J&;@tIa|eVZ zj(GG0y0{ma?0rv~(sQ{+FWJ*!_u_*WrLa7BY{&Zf6(cXZLxCk)%ZH`OK6BepM<8G{ zFa*C}cB}oeUFAjB`E*LSwZyBXSDOfyJ<9qBbM2&Lo4-&s=q4gbhyJ0m=3jU4mN@kC zwslA>ARc@bhf{CiZW%l`@WiRVSJA`kSLL9Hmpy4SZ-~Y@Gsdd&P?-6;T;NC}mred9 zgBiMB6p(pIuh%Fy6E#=RTKFJme|_2;sC9g>Q7UATw-#jWB@9 z>`3->lu=rOStJ>*)UbShHhf#YRk&Y^>ZL&?kk#ss3sE`-78>0Fi6u4p?CHsMtkgQ0 zn$C8c+QZKbl?;@a4Rqs>HLfht$(`Qcm_SycEJ~RBf%+0&$;|0?*itZHsx5AbJS>{NvxW zS*#J!*pT^4U`vvOQ< zE^WI2y_x+@AZx#BWv&-ae4|T>%p2Q}A7nTssMlxiadsO|M6C-dYd0>#rd#;3W|PEE zcX^$NK4Z3}QL&|*K|sGBlYnG~v4(PdAk*#+__~qp)r(!$@xwx5^KIKzF1)%#0+r7? zm67~d*#O-WM#&zgCFO4985Ke`oCn$w^-f|-_4mgFDbEe{r*A|)R=yM@WK6oRd9nJ0 zD}Xe!FE7S2iuRtmb9^7~TxR@=ffk9GJdgHl*dZgIR2JC7@T$D0NM~2QiDf*e*I9@s zSs!b}1v}E!LFde>`TdGO-Uzr=F_q-TQQlEt9`NKCKoneWZqBF(xgN}9;hFEEuJQXt zI*egVEBeBw+4>DGs?pxA|};PNDfGI;x2zin_im&Mq%}ikZK2CmOK1M5>q@^;@htd z;Ww-9SFMZMqMZgvnt18!H60;>(=1X|B1?5YSJYJ#>JR#I9+!rJcc#dtA)iIV1D0@@ zQDB&?4Kq`jur(o74IMlxqG1AU;%qJCtVMUpH+l+{B{uoQ zR+PGr@5^oMCjV2rq@u0Ro#>>Q^vcla7Xn1Atk5<1_ z6utM{SU1#`CKSIxy7sDVBDpaEoA0~UNKImAK4|n|*g*SsXK~#h%XNexc7oAf$9N+F z-Sc4OayI@la*(82#c$f8%OC^HvU;gs+Y#ESXqL)r4g<3uN204?>$#vLcM0hhN!^>p zx}%c!J>eFmrsKI6oZ)O3R6a0$nixNb?_R*JOO)i5M4GTd?l@ahDoqnA%9e$NCW2eD z&x%hccwIw3FY=Or?50RDadoPL|GFsm zE}Mmj#s`U!y>2ux8xcIohtd96_7^WUzz964(TBbp`zZY7jR7aG_^TXhoY|v-Zi_Le zA&b=vx!7~sa@$OHM2+{IAK&0^TBZO^)&p?gw&!cQd%KS>QgDJuug@?W2A;~1CtHUP z@Lqj_Ea2h%)WX_-i6E4?;fx~sz~^GVX|esdiHaS~b}rhWZQe&9s^!GGX~J|Z#}z_f zOWh=+ph0jA&%*14rLS=`)h33k4p+1z2U*VppQhgddQ37k0+8#R{p=4uWz>0a;(|aL za>yBz(`eZ|H#gIAytV-3yOr`WaL$*XLGWjIq`!TiIQkgNl^`5PvK;_m!|92HzxNoX zgGcuBVXv1&2z+0=W8NEb1&ywTi|iDb#+#t|2mEb+zxN=W`u2=3z?5y0?c|Mo_yFE( zOC>jW+eQ{Nqrx^@=+P}fL@i4qr~>)+^nM6sMniV~{fJvJlE6G>JPS@v^qudqd=JnT zO8$zL^lAvCG4Xq{NzQ)}gH|nSH;L{%o9^v@aF=@(a9-7Ssmv3ZaOu1Q7p2A{E8dIHs5T-S%-Y-2U(xbjJ}UfMyBh{vR_D0f_;e435)!{i z(pTL@-{U4%EJ$U-rm9VYo4$*0l8S2Rzt)6p=)E6Q+W%4Qe(&XxxbW^I!a1|R8Bhuz7iYPkL6YC+ZP3*n*gMzumFQH7J*Az~F)-{ksI!8=WRq6ESRJ_OCs=(g|Vo)kLHMCTgiquSMx&Y4wr?*892|tim!fp(1#)ZXEJ!C0079!nbtK6g}$XgmP-Iov5~_o?FT66kRiKH}|xYHLPUH0Vey91JIIOVKg%$h?X3Zd;C&CqO*j^`%z@lzB|-?@sfj zS0UUwQUE-dz-SvP?m+b!j;4s1&AfG{4YTTlIWdC_cDaK^il0C8fjG|h2F`+ zm|?XyboHX?x^>ZT?D^N6YrKdgXPJ9PZjE=Uu znxoWe+c*XdV$U@fNA}rj`g()}7MW3PPUy?y74s1I8w#q*vW2xH`{!Xg7HCOh!5Icp>dx#ogZCF0qO!{Pw=(^dxh za%BHlq(x=(h*GPM;;UkP{#kkIDoL*F&-EAHC4X$TU1f2jqs5l9=a(q0tq0Z2I>JBn z*=Z!*UcQ){GU0sYWKPULNiwLJ_vRMQ=xW2|un^~K&gBi`#zvdFl1$BV_C%5L&8ZTL zsOwD?KSLPxtO>AA^JDUc*bNMdyrsQ!e@Jb-@!(E2pHPI3fE=kg^_qQR+AOtcriI~3 
z3Wbv0rx@jKdx_qbF1J?Upr40!rTyCme?&(5ii$@Q+sfQ@4y?#%X+=9gBiebW!yj*i4v4L9vm4jy1^3+?tE zWy@$OMoE}jM&a1T;tRP@X6ECsCB)z#8Ex*8Ur@%0^QE(9%{Y0O89S5n-=4C%d53p|;EuTQ&> zOsViG`Rohu@a(oF_hR^hF&=U!BD)?7ZkO6F6+c_Nm&?`6)3ZQsgL&+J89q0>9SIt) zZqJ|F2{IAPmvYR;hm1!hNWcq!B}fvY&M98AtEFF#oG!Bq4j%aG5QE%$;jumeo)(Bj zWfIC2Ix1?lG>=^uNnahv9ju(DQSX2Y`6;G?O0GWUocU5ep26YE=SK7$RAk6+Wv!qL zAKpE6Pw(2jb>TxWc3GD6GrKE?Ta`5Z=%|&Dmkwz@?=}@GenKQLhQvh$m-ghZg51xb zt+~)1Tx-qq`Z`MCy-2~Rbj!%sp=Pq&g&4kcSLDzL)m;6`kp?A+p{Mqc4pRd=U`A5# zs0nOCS}Qf`n+l1Mg_C8`0zi7_H~s-J=mqC5vxEo6jzeB5N&12x<>o{w6X zcrwgb|JAF%xJ9P~M{Z~s1qT{M&&?J#kwqy{T(Z?~XiM%eNl^Hj=Ou;&RWAhSlMqO0 zxJvqJyHqm7f7rNKD*Dc|>@rwmV)$kBT-xH0X54Nnxc85hFYxOpS6GRy@HwxAy`h-#ZINJd{y3X zb-{|ueBFYn>!kzStBt=%-Rr&chy~Pn`>;C)KCm?3!_u5XqlB1*_Eov_JWJF6fDV55 z2AD`ZJsbOg?Cp~x7G};yg$Ry@-!IDp}Ov2HEXs?BnX=A zUGO!OLV08zb8qR_#4^|}DAwf0fdW>nFP9ot4>)J%x`;dmh+^p71egXZNUGAC7!r|I z=5)$Qrf(|9&x#c-bbm(O@WZKQ^Sof$#eG{sbOHOD{2$*4Ql`7q+*6y`N3g`5bUo{c z_0#s?eBfahj;qAf>0VcvOrF>Dj z958#DDBLrZk|(%G5{qBAy+c^sXu$I|5R$o{-v(29%c112Da)r;bj&PMzo7NLQmd8k zgg@9#bEZ9z!lgyoY}D9^zmmZ9VugNQl$RiICO4yD<9zr@k5V_gXHOv=yRHkj#KP{L zLKVRWG3lUms@XzPFb4hN6F*PyjQ=`MCy9iUq_Nf)JasPh3s1$2 zg8%L*@X8B$!-+*4b;#Ex!LsHm5L~^enK@+www$~4i4$5?5Ia)Jv)INhewvrnr5lnV zcP6smcCxj#q0Qz4IXTHA+9hK1jj1*=WJrJdANEsrd7acwtE+y2!T>}Cbb>{KD%+fJ zTvse&G~yd41?^a-}+xjwI+4k zhS0YO03H3{XC=@wYhkFlX);s{6{5>1RpBL5pBu+L`&#&~4q>G{K=XHxErl;@%Kzr= zQZ|sPaLYhK(DB;!7at!>z60m6P)4^wUeoWQ^zSQl%ijna5=!)3O-ftW7$gg~rgTf1 z7t-}TVdgV@O9Hnq*jN7tiZ8_GD{D-es@y@hj-?!c5kV&P>N*B{2R`HRx&gZcZM@vym9#y=g8*GpFdzh12fk@PY4 zFhQ)j{NbC?VUZ`L!1_<5Z-3=487wVlIkDRW^R=77S}k{C!xp{Wow$ShovQvT;L9;) zKRFCGbKVKC7Sll(YDit4Y^gn^ATg?X3JU5s-#S$+dR{o8xzuC{ z<WAMIq@H_HfZsnoITkHQV48Q?nxAVP zzg6hy6x#XNK+#+d0?eWP`I|;WnNFkxrC%3=Yeue0=Sk~v6Lf zIsuMdHApAWUsvCd|#yN z^$@yLh8$)s80sjUu|&onFwb)!{9@qy)JKI{PK!UxeU~LLOcGi$&3er=@-D;#sGe{4 zH$t{;J*o)3!I(vDr4p3^@CgnQoXXi?%jq*J%wBl&;q;VlT&3O~Wit*YS)Yjo! zNxFRdse2ohft2+f$_TPFB$gvehx%x@cNrfxmfdBQ6(zXmHp0|R?c@9LO;KpZiEb)!6g_R z2%Nf#W(5r5rZB9lS&t@0U_N?=0#) z$=cB%ILu8p*v_?fmtt>%^j+T_7-O3Qpk4lVz7ab9V zv@Lz(DapzF!rG*(rs$>AJ=6C7wZg(N6qhzUlw(s<%#=!5`pcTFcY3PIh*-Si4yz$! 
zR$8&?VX z01v1k|H|l^OF#PBdc#P>PCCxj1!%;V^xeyMOZT_!M|BG*pI|jh4U^4frI1TLEW`Zr zCNu@+5=;`D+2RX<${TU0be|D6V@;xR4Kh~V4AUsLbkulJXJXl(sPX{esa!Kd7lcB_ zn7Y#M-2m@GL(jQQvH>L?gZhkiUiD7zK2krr`M!cdAtl-r->);MAS@uBEA6{wVJq9_ zS#nY5e6K=D0z8)~d&EJHjM2MVIWmj>Hs#H7zlgSGQu?PX_R-*Bm-kK5=jI>5#31kC zqCn(+2IjEeCk`ym1g@2$2AZ?hI@2_>JwXF<3o&Gc*(uRcUU)iRU){XGYKnEzFTdQb z&E|2}c14YiAD|jYgo1Iks#W)L9cS@qM`xk-xii-(H^cT&c-+p_U*;2r6RFX7fSRo% zwDYhd{bWz=(cuAhBhA2~kxd!lm0;_WPr7nji7==Q2CPG-haAzT8L;*;fF!PY^*ujq zCHV6;;5w6C3u>TkaJ?z06lHfs+P|pgGHDurR?vwZow;>i$ zLdV4v@am~k&Uku(f<@wdsUUe2xE)+6LAqtooGCyOQ46R#Nt1GkK#nq*FT8I>v1Ml1 zTqH*5+I#T1=6|>d>Y&|JloF_aOr?aNR%P}0+G)L?Yo)kSX)e+C^aY(FnAr2NaPK;N zJwW!oj^q>4;WRPt)OZz(sO7Be`2>M3R5!J%&=*&kQ}5->fwY%hC=LXlK%$#Z)PN%@A~6Gp0rG$8?kzTP()t%sYqCnN1cTBm!wxoK;^J=c=?fWrIzdA; zE}Hu>OQBKmVKC4!tF3g!JsXeqoeW+OR-ukVvkcWQPDi-ntRpcy8X$7Q$G}0foMzc1 zV8XQ59q?)Q4kzzD`kK$0jyDL&;Aiiyex-Xtmvz!BPT-QmB~JGQYv*6P@Y;2lGr^ed ze^@2u^2t<9BJiA>MaokN9v245D!;ps>*2?4XzCpq5xF}B?7Aqj1krSRt);`5dtfoP z+nBgE{&zB?-WJX}B(`?gY;xtH#DT?`QHRhkYuF_>11`lWr9W7-YT(xvxl8NeG|yXh z$Wi6E7>XHmT{K~OZd#jfI>QH3}&Lf3fOMdsjgIZ=;4 zZSHQZklmU>aGa76#8aAj>t#{Ya!gIoqCJZBeN|F>vRc3`>{0(akFC#!@P?9%H65SC z9>OROFymJgt;+R^^i!%IKi&h9Q_11P4b9r12OqTAdmMO9i=w=%Ih-sjrD?auOL8;J zk1xVr`2_pgI&xViN~!c^bovYvDmam1P+CiW94b&-YHu_Qse}Zm1AMf6{OyR%N!Vsa zHCnujAM!QD+!kW~H;G9Gf&Tsdw*aD1RZ>v`5fFet1i%OA-!w=8L`FhFN^_ZKT^C1iZ;d#pYP(V;bLezxrEJKgUKoG++|{Bx&MHhhnG)8RP3p^grveVMJ1%NimI-jzJZ~U z@e2z}tGCuRwsvms+&w(KynRC6hkghPkBCf2{FIcOlKMIATTX6XKBl0s=zCQ)u=r5l z(D<{Xv#T51)7v*ZGCDRs@pp1+acOyFbq%+^v3YQKbbNApc7AdBUt9zr@c#-6`1-#B z`~Sp63&cf8LMb9Y|M8+T=pIzBb4inbdXEb;H zLve>oWbyvNf1&+PWdHvH3;sWf?EeYu|BY)3L-ue9Q_xDf6 z&CSj0_2l0BoaemH`y4pi&yem5#kzao?>48{*-yE>8KivNxc9uK21kwbs(rgg0?YC*r%@8R->j8nYI9g;-Z6@*4rM z!sn#&<2ilwoxPmS2l)$`!(#S!W1(if^9R~3@kP${@89V{1xVk|MmiYs zGo`jHq_Sk0)c6Pe3jBtV^9I_m$!~6-e~hmntWIyCUk^0Nya;xu(1@D255u>&u#F#M zy`g3i7O7<;0K?I&us@~+>dG57uVzcV0$-ey z;YnZ=y@O#`UrK5bWRRePbYFU(*2XIH2Lh8h&YhMwn}FwgnLoC6S~4gD-dNs$01w|W zgA=~yIW22=nC{IMIhxS$Oiyc&Yni$)^cQkfCVwfJiO&))nofu@<4ricf4m2$Qe3)! 
zje#obzc7e+j&!KfbbNYbYja^lNZ z;p`zT5lE`)-5* zoQ!n46B1#9z)UL&>-E-cf2M??h12foaEq5eC1u+x7lSikL{>r>UV|ZoUyCwgO&CGf zbIBG*fGsKzDR85C55J9O@0eILYCkQG=yUd}M9jW3QOkE*Nq3W`hP3-&L^Fy&1&W5w6i1RZY~G+c8vZGMvopUeRFz_@Wa!RJ~CtSU)Gcm3n* zf@Z}nJVTRX0K*WI#g~4z9&6gTln*^T)G@|UK$TP$p&_;p??o`$qOj$&Fx3}fgGUWj zcII;7I*dMp1fHdeZ*LG8iZ8Av@`6tzM1s0b^FY+562+L!chQ3e1L*Pb#?&7vBcIj` zZF5w41>WDQGI(APZFYtB$%cGb2~_nKm!jad{ri`<$xsy*ldAbm@@kDvL9-8jKu@6p?3eUf0PGqEB-0$)f2h@lAqYiT$$csuOg71 z>8jI^aPgcF*&NFb+jRf&$oprEap6>9@N>K4N_IAr`AjDFpfbZ*!v+qU_;uKGSWQh` zD|&tMSz<17K5|5%MLzlI*S~@(W4`<*My_i_W6c*HdH(9Zn-nK@6Xk#R<{Vl03Fs%t zGM7qBQZr241_=b2Z?J2=5lnrTfwmM3&q+Z;G_!A;Ufh!g(jkAcdK;ZAX}Le6e?SxtGyUG8ua+>O_t7p%Y-nA5j~6#{@Hne22)F+abiSW_Z5=+Cj9P3SyZVNGCgcXlRRyV47f*@kfNPyh_L_T!=DobNX z{e~iaPu3F}cq?UmB!pxU1E}tCNkrkXGwJ{>^bm4e0OLEzb!BdN?SXj5VfON)Vl6I8 zdyZl@5J;LZm$eRAz(#-w^t95%jgB5b2JDI}c>KF2KjnC=t>V5ITsJA!MspJp9Truy z)HR0e>GlmkX1sv3_S2xWZ#g|(-uX6L28nQylAjga)tPhgMVtq^T*<8Va9S1$1mlh& zZI4K_jh7@7CNVmIbzFl!Bo_PLp@;*{Xd5E%TO_h+_Q`^OzQ-9CzHW3#QET(WpskxZ z+;@gSRi~bLZVR-MPM>#=M4EDf+dx!1-r3UoK#9Ix=WPK~v0bzd(!g4Q#>jHEb26(S z`kQdt%7oG;-4i9}do3O?!Mv96%id27^SKAaYEWvOyvIDbsq&Rki% zB}&(uFb;$#1WvTi)i-K>@U`|5P+d1~I(dBmAoeRN+u7WzEG$m4=_Xdx@_RNtjrk}* z7_2V;NX;r-Ka9iO<>78a8f{W;T$azme-IZ=|qR3aEjO z7cxj8AC!WyYx|ZfVFfb99Bgv>gp|~sIT{?S`u7Nx83ghK)F713d#fBq*G375PmE=P z_XI?Bmu>!W9l+yGs}Xb_*98@h7M)EMpUl$zchHEB+3cwZmL<297C9Gk)o=e~*t{)s z65Wd?JYEmCzJ1VLku#SZcd8T7G_>AP7dfY+pB6YJ#roW;FCLk8 z0^uIls-gDqG>h@Iw3%=HI*GbtNhg$`^G|s)PCl zJt+0;EB6_XKZD8=^Ufs#1Gzde_WgG82ASKjo0~0OvJ-LhmvO9ZH}`rw*`p@dXRqM8 z>~hQvJa2Wq9l}z(T3}Ecmsuj1^9ktJHTDq2PumUAQ=VdzzcoJcnr8KG&ir|LQ$pF* z$|lkv(s5v%3{_=FpoVujMK9mS%339oj9mo2Z@eAO*G&V z*B*1r9Kh%jB0Q(iri{CRWqM<3zP%?D&i5thDUE{Je^|udA9g1gP;JcZ%}1tF4OK@x zEp$KW8mqNnk_4_KVx!!6aU6ZOnmX$^m4>Rwp%SHy!D@6W_Pt-8o#^E_qeGGFN`@MQ zKzHc|^EL2k=7Fz_9?xJ1iekQHnFW|EVSX0#ZRf=v|B4i%*oNLySO$v&)vi7X@(JCO zT08nB{(BjP?Kg;(7(a|y*)(HJ{15-9Y8DsURLp)S&s#_J)4RT?yPQHftvx>Gt{t0( z#T;?@O>e|36j|flCW`s?PsDMnI$K++e9o`x-roC*{|~fmsG+E$6Q7Q?42;ZRg+`~R zDwJiU`XT(dXPOroXy&VMO$4a~Mgj9O8KYK(?V^)iFKK&H0d?3^=G8Y$|TH(4V zEpMV@M`N!4R($-vIvmpMDi1a9#tvZI>bdaVC`zHia))ZDh=+6p}p#o=Xtqbi}<`xH=xIS~}DUi94_ z!{&2Wfy+swJAMTDU2|OOL@>d#)xPUIYOS|m zHBj2_=S+7bO_bN;2)F5OiWcpo--W{*MBf|P8%1VJbgTJe?&~Q)-s=;)zJtqi(&f@w z{eHMXOESEw=brN8F4LsML&Xr3>}Lk5q3@?9!&-^h$a*z9BpfC`u7M$<8Ou;}R)KMa zaB-tO%u~i8px#aKPBBATv<;x7$__ee-<+PLq3BghZUH0JpcOACsb@9y^3je__Z>cw zsvDp`wtfnmm?(%7)9s@tFf?1J-8Im@V7BJvUmZ&^P_o>KGJiuqI#r?ALhT?-jZoz% zrsk&|xaffTvaA-_`&+I*;f5ZMLo+J=13jp8RX&jz_P1O)Iux5il$*~r?#EpB^h|_< zdkg^eya28+qb&nfr|&ipUu0wFMF%M~DsRHvej+%r?G_t8mQz&bj2*!P4cU(5H2XkH-JM|rqSXxVh zi+II76$Ke%6$ZWaF+19c_#VG204gPsc$t!ML=Wmw6gvI=R?liZQQLlq6xV4woo$M^$w7wKDNKvu49N7@*LK37=MLXqXjT8G1 z!zHKUeVd<*rb`uH^Bh zAya35G7fWhy!GvWpnp}bi3q(=OX3B~z#)$TfVC)I$ClVzMOrnxwq&X$Px7YV)(`5aywXnj_0L(xB9@M0j@ zB5n8Kg8_jH@FSDD_+K9=*;{)o434~CnMvfbHNhoH|;eueF#PFt?#8Yfed&9k6(GfTW z0pLi5{kf8%F7f#}35ij?rStvCBbgQKEX~Pmd<8Fg!wed+2G_`Y?oi-io*1DvLBMo^ zVNNU+gOSHx7&==K#msHP@MF;*@jF2ReWB|6r=+JTloP+cypau$MihU=&{}+^!k~jh zO9(0!GIgO^h&<9{nD`3dMM3Z$t|z{#q0x6-DE838tvPU0-c7xpi*t>=^lDR+YGX0?U)h&q7w`$`l3pMFEc^0wrZu7 zL9`kFh<%fxLoqEo3-23M6A{|+=1&(q_CCYSEe9d!TLDcoG(XSN+ zwq|F^&A+F|Q#V}5JbCi}K%~V_BI@*!od1DBarnd7(S6yoVmm+k(JrsJ>7O&+X6$To zzKAI(QpDvUEK^Ul%H%xf%Zh7UrnjW&WVC}5z>S|S=Z&FoX z<;YZS?(bibs&#`@?1w>-Xkyicgy;bJaUFsv@vd|1UVkB2i}(@ZZ)~ilLPz;R>N^eT ziTpmK6ZMa5*s%Dr31>gP+IcmoD_*Nm@oI8#k#AXr*qFy5WZ{&KtERuvet#u*O$}Kg zN<)VBD^g!my`3@e02l8l^^J!}m)v}~rPN7O%V`Vi&Tza{ zAm8-^YYX4*7|o?fu9{r23+>pn^}U`rDlO@_5_5BHXN$qq!p+z|X$F6AQl(UVVMAqv zb~(9~V)o8mB;{JyC2sxTDyq-YRLCwnl;tqj`Puqe|eL=#p 
zYS2lKHofrJ&Cqn-onKF&+y8+MO5KfefUcP!BR!(B1h#6lV-jEiCr%(EfQk^?ofwbb zkH6l|zt@Q8*0A$1x0XM=5d?OI=dDQ*RoS6!dyXXcVX_l8R&x`-Z1Uk_1zbh)=7B^ zVnR(ej7%;P(VX1vQ`o?#YAvnYp}FR*4Ww>6^_lE6 zz<8E)25sl!%BQnlb)shJN7#yy2CxHdb8NE!0m0>giW<J zvqQ7@v^04=eke5h?R-*hPYgB8c{%b>vg=6+ zG*%kGHvYlBAA|w3upAZqRdEo;>T(qT>IT+L_2j4EY-!qU!Fw+OI#FH^FqP{AAGPQ9 zriN`S6;raGMxCw5NbX&ZM_k3#9|imDK9q{&=OIXXBgp7%K!nfN3Ye&vzUQ`fi;(_m zD4u5T^;X&y$^7z9`6u#@mb4Y=0q!^giQ*iHnICt`%?XMW@(qm6zM7q2KKgk(OSEMPjb0*q{vEnIS!Q9#cJ;vaQ1X+vKo`l1d{t-3feyRC zC}&F7qnYy3H_CgjOqWZ-E|U6)E{2aJOwHd**7*AaH85v2WOt6l2vLGZI#<32%U=+x zZ|SD2j}k<=M&XeHi5Rx=V7mxJ{lW!@eNj{xc?u2g!xO!FZr8p8>mh7W2=-ui(qt5r z@vs)OdN%2H`su!(ORiA0Rp!ih#wv+jDAhPSeazL*%~NbzkGgkRk9eH(Glf#g5>;9$ zOXIto*ZsrS4psHuMy}BH>QiqmC3A6HF?%Z_R)dr|wh7B3!9?F4Gfs|WX)KuuWvCo_ zTm%H9;rhbTJ)&xEw-|Ac4|}ruZqW<>kbD1r^G#;pYk!PETnF<5FwAeCBFKAO9n=uy z#jJ2DLS}#Qs3JiM8*xgVvHRK=hRh&@1j`9l&d4ihT!YeL_HAQpy?`2bf+%SN(o{~q z>bC7QMKwjw=a7TY9-o)qFMfGzxU%u*jp;`d?Wh~5Yf*vdYr|6X#c}{SO9m0+3#$?K z)Cli)WfmG(iTod{wJ8dUb=x|5XO&O7_tVlzpA{pZU8b+SWnTJ1z6eE|SttHHDaLm6 zx^PNJXBLQDMJG73iEIbgl_n02Rwwu$XtKDr)}b`hYR2`Mm_VB&lkIqWO#wi-5{)wV z)1j-8&7~7|TSg%_SJr4>fgPg^-Xv7D` z!2Haa>`7J03<}u^-1nEyQL_v*T1i~QCCKc~5IN70Ac(4j-awnyn+jqG8%?V@8q90BMk^GN!> z-fXGsHA7|ft~XMw&&-MCSc4XEvEO~5*^I3J+Oh$H%s$u$WcR0yNQSq}fHdV*rB{tq|#~e)u8g6&# zUo#K=n5llz1@D__!LEk zkj>}%cF9S@ZqO4@wu8>t@_VE-gD)Yw3D&$E^*8|=FU z;OmYlWuQuF^5AY;Hn+=FC3N-7YAB;t&f&tbh~^U6*asMB0O@}>3WX~9h_C@Brd1Q3 z2QuhqH>XP#&c5h)+K#1PJBzS{NIw`@+@ckr6c?-(dsoCTPyPTClH;yT+9R?fz-)3(=SOV(%EtjlsEH|c{`1Q79AOP`iA45umcw{Ftzj#VQPr>Dw zfsXBH?oj-uDmfLmFW8r>X)sWoJ8I^I9&7YO=DJMj*!xrpX=3(?RG4og-#RuGjm zYa%Qwctr`?>|bod>wER}SeyrSW;7)RUbqn~PCj3FsMd?}!8Coq-+B)^`*eF^*lW*r z#xhiuRbZU-n1o4CMtdPwijR1<(>-bu8GoBuw1+dnMu^;pAzQsX?8E4=xs3(rxvGw! zGR2=BOXZ{ZBDl<~>58Epe9!3PYuZGgUmd4_ft95dyKEJ2;npArg#F&+fV^1 z?+LsSAW?L}8%4?~t{xRA?pzuY;itBAhr#bo@`&Vptp-xSzJ>F^CN1iKpEo7U{^fK{TBM->|!l9|)1CAiI)Xjn0UU`{^*L2aYhC z9j0%u;9JNG8t~~NO`w?{V=~#ieNl;e!89t2nv)m-{^&A^;T?56Pr3|#xp+i9!)f|I z(EG5(DwCZPiF7RVDyXCH5IsI}Ikk6;pTdW^JcN6B?8~hFR#2=BWnecL%eORXOEH(ICr>P)*H+(D<_+`p+BjMIc0fu^44QwDYIGgEWd zvAE|}ax(Q}Pa@yw-%=T7s0jOGk}X0p=McwhFPz5~;sE ztCPqK!uuH(zfzP2sXA`k1-)N7hnr(-9%?C2AOnuS^RdgR#t$@gFLzW%bw$|!`>Q4| zyzViSq#&k-R@JC_X)g~7a&~^Ke*X?ODreH$p!VI}zR*kVFKj(Vj9##zxQk31G9ucdnx*z z`&Zy)jmZSAm00G-R)vK@20U&usc+q*$bGB-$BfvXF`fNuOR9$AEj4N0!Zu4Q^e1sb z$ik*z{|yQJF@8MQ`0aWA?LjA&PVamR6ypyTb5u;_%F%m#X{0WVmrbC%vs^-Ovw0x3 zuEaOlCZLo2XDi?YslrOr5^U52U2t{+9~j2`2dY!Lo9lfEuK^BdG@5&cokz7QQP6<> z@kYB#L93UB-m|`-q&vUxM(qTBhxn8w)GRnv%JVcZ2+|RMo_Ul!$lCbyMZEpGxRD@u~?NHrh;AAP?)KWLCLn@{(|WGKB~fTbyXuerN+y z=QaWLXjs$HERfo~-MjGnEjVnPnMKGKU?y7H~t1GEck3^y&gFAzPjYJ zJ?vAmu_nl5CG>nT?6u;WBYFUHE-q5{evnD=^2cS^a;$}&KCxSm$oQbzM7zz7j`%SS z9n`E(s@Cl7P_}(K_lK=Mq3YZ~ealj^;rE+(Q#QfxpYGD5_T$&JpiFwpe405AsG}%l z;g4F--q%x)_h5X{u}ut33Gp!T_Zu-QA@`W60RNYn1hVy03>_K$3LcFa7fmY0;pR zc-{z61b6@wi3^w#?f%`g69*Zafq_V$&*maY9Z;PHrRU$m@z>(9E3q(B-XWUnzmxam zqBSFJtp$D?P2&!6l{^CC_SSMAkz?_!&c#flQP2}yO$%OpouM?$UN&jp`AMuVDOeI{ zpGb7pyU^UPOe{!he_lyGzhNKUUrY@uNW44q^Y3?|wXJl~_~q0HN9EoEaBTrJ`^CS4 zoAc4L1)2?k!uNf_ns_Eu#cQ~?82}Vkjj!^`@1qwF)up~RID7zdsYoKY@wM-d_!S_b z-Wt-AUqGue;3AlHxAPPvIX1r6xe~3ljca{6Mgop!@DdmCPcn-WOJij{Wx|lk1+B)L zlgQfCJF;~#TZ@P6)D~m|RHS7ovz#L=epUh{;q)xG*#)JEI?=eSM9I^dH@mjDJw%rz zb2euH7HecPWvkI?x5r+^GAmz(07J97!^?-^Q`P0;Qw71kU*7$;3kf(aKa#mF1CXO3 zW9;~so9$QS{?P2_UF%T(o2`q>VWQ)_JEFt$&}_gy&IiD(vU^rTDKxKdwkT0hV5=N^ z-*E{bvbY?NQi$4AqUo!?`Ad`&b&LrnIxePuM>7<{su*=#v^m-)>mWKE?2||71I`8s z(*Q(VUatDhR-WSp>Tp~C`~PECeKO;`gvV+t0jCwF+`oGL%VE4qX%sYerf6Gr(h67Q z>=J&uFxd$Tlvi-=&N=7yH?7UMkO_qW3);EcE;HsW!7uX+($!b@emkmj#{?=wn{|wi 
[GIT binary patch payload: base85-encoded data lines omitted]
z%1_K0^05DV7(3Y9Fwgy3sC8@^|64QN{h^a&X8={-p0-cOh@R)ktM79fa*}d)=8r~1 zYp>Wu-=deIKmdVlVX4ftOlmH@%^kMd4!osSSQW#hthG8+zmXMtNdJje;XF#tOF3N1 znd&p~2gMhv%E5DA&xIDt7!I8CjIvsq5?&ILpy7}YA{hUDEy}(@qG+%rK~?)z7uuCr zhfHuds3{_b?QSY3QlobJiZ$29Kh|F(w)N>i9}(27?7H#dN1caF2#KHkCH{FSZg~)2 z?#XVPCoBG}m=}d!fR_(x=t#n8N^fZGMnSmLMJmSEEW(WO<2%M+Ghg;10gT#_V2Q4$ z$nQ8>VSJB@T?D;9ywN4MZvVw%Usp6C`F?Y3g`Fw0yO9&W$BLEO!B-u?jLC!>OgHN~NCTF!o}M@#@uY8mPmoGz#UrGGq3quW74Nq>Nux5Fc1aL3y^{ z5OIFRT}0jibCP>awMLmD&PvP4O`AIqGoV~X|7Q14w^RG~52ufBEPm_X{fV8%xsvq* zJXDT;>&K&_s~V^^GUlQTOWQwJmJ;2qWqsudxFmy^LTDRz4={=M6V0_3UB&DT)Wplh zz&97(nnP%}9%zkqiP6-DLt&}=A+)=TSKRgw;n_T_wTnZ&{61OyW~vFgtt;fQmTqDe znWoq9t9mKUL1kZ*VdEG7tE*GfpLnXi_r2;dbPM9(Z@0ZKKB+6e&Ey3RAkWXT7T03z zvJCA5Uh0QS_*zT8X4%qSEm^yeHW?tbd#^FuHW=6Vd=YIngZm%o9;%5!UQ+*BJ(%4` z^J434puEd5iY@LQfs;!q2(MZ*=M+zssK~xcgt6EKRDqgwusZ%j$ zY>mID(93g;bvxY*QK5n#YTKR$pdn6zva(jWp8Knk1ldZwprC;R_)L|n2jcgI7W-q# zxqYr4mMh&v-{`zux%A>nQhmLo6Tf|0g^wGA5T4*TSjTF*ni4O`2EjO#B%%XR!L>X_ z^+!f;x3$#yZ*e~=+vQ;p$(?R)(UUV1(j*BdhAyYZi7UR1BNW3Hepp}ODDH-vK0Y(oso>I+u7NA33jiYN_A& ztKMD4dQ-z63De}a{Lj&D^6PSIM5ilBsnZE^o$K<%&uS%iviX0*&&%mBEYNhBe*9(d zXXtxo_5`(oD^b6io-c7xB2?skAv^_cW;KB!o{som{e_hnedxwkmOl|g8-?a)1G@QjU{_I*>W6p6_1 z>*|6txeqCmTm_!%d-qfA)3r}fTj2<%KdwgrS;Ik%Kc{@OOP|Op81c81LTx~jvqH46 zIaal|sTWWt;_#6a&F%a?O(F41LN}vSDn%I1c}aC?y{@Y#jQb)_6XdWccp8I#4RG?7 zcO*WSataMEFo(SK*KWe9HDJ6D$3#sl^3r-X%rPkMCjC-l5PF&mU40?;F4b)akovR~ z^gB2G;_dxJtwhK}dnty=FFH>;u!;=ep-cr>T81Hht0zDZTttc*fPi<+xww9a^Twst z=zYdO*Z}))%zkKqz{tT4ph*=On;!+2y^#w=N!}92+Ey>HWm-CVxG=;HCCs%%IA5vA>E9Gl3?y%mow!=W{;e>H3@b| zQ^+u3nT7`FW-q+kg=inRKXSZvH(OCaLG=wE(Fu!sKxV|ze$ZXZlph54VCUQCamgIC zp#k7TYpssWI7C)RJdj^n-_H6!jo6TbMuy!PYsw~s=A@A<_aKAnh3L88=Ojphr2M2d zAvFYcs{LV~(pw2}6#6Bv@bUL&>w7KZn_sc+7R(791BBZp8d{-%h}MyUym>nDo9)48 zYbvAAcH~F=U!>oDQXC~da4k}S6AmxtVCRonANHD)D*?_2@z?ADlU{+3lnn($P@V6w z$d0q>T;2KLdc9}|wPZINaUk4Z%iQ^Cad7LS#_z%XeIXuKKmm`s%igIB2I@&6EC2)Xy&=NP<;5Q8vgqK^v^C{wbger}0kMoJC=DugK8A{v_XK z!lvan>$y_5{Rg{hB1$yKOFA;aZMpgre;ZjYak&hckFTzh>jU`K5uU_+x&kE7e5VT= z6hSK+vb7VO|AF9IF-mmQr}(OX$>D?ON!6_(U6KNnD&N+T@fA9>;{F`6W*Eenu!@{E z{Mqb!Y>AUloIpy?2rsJa9S8H=*2i`02^UzSCTfk7bvj zGJF_U??)a+QasS51PxQsnSq8ku z-ToVpx6WuLV~0?GI|STF!2S{Ui9fs$+QO}Z`T>~gq8okmggRszhWrmyLNkGHWP6ix zy{TQV=4ruJ!=$;Wz+{_9hcf09J68MhVo9bqt?-kUp|;KlT%sF5epKb3WdW1x77zjB zQ45&XQ|bYgUD}E?I9kBM!kKl47Xl{#%{NnKDLXXEc^6QeHl=J>V@%i82?lE00q6-qj zAAZ3lwo8eiGX%pw5sQjqe}AD(w6=G7B@^*AuCfs9;@IqoksJaz7cvnng@s5{orv@2 z=4dlCFKj9i6|f2n80CAnbgJ(L3IctpoC9FW$ST}3VL|$ni|R7w%Jq-G$#s5_(AeO} zt?8?U`4;Ozustq>HKhi z4M*Z)#t$hRklrc!CuDK_Q(qwJUTXWVnq^&R9l~916C1f`_g~3_tj&UHk5#!{^J$8b z7)p!t<6@tTQ;HNTx=Dy`+l%+tS}>G!M%GUmrTnfAoo-2{JW2dBhK z<@&rjC?MPAR_+jdG8yB^IBUudK&Zf%8+WDa02w{cxt5UwcTHrPod6DGt)6F{nXKpBha;-wFJHa%Qo)z_zZkC-h2-^Z1sy;emhY3!KNnROUO853~KAtc&Vh zqf8Uz_gngBMTw(XaF$fbL-kNYX1b=%v-8e>GJiA`fnK(s(fg1~{i)YIXm;o!AhW`+ z=LY+SbO)+qm-;jA>-_ghKtintUBaX*3tpu}Kt^kuQLr2Q%SyGLft%^?;P2O;oSNq7-5}4@2WR^ADLnc)kQv7K`2pcWtQRPZCE3+>!^K!-MXn!_d5X z=hsh(Bx}ncER4}rjDxZOjMBO>rm3-zi!G24{X<8Ac!L>oxWD@QgbbA{(Gyn&n! 
zYdO{IZ}9>CXjzj_NEJ;JG=K+CqYHOe=GCp^25t-5>b&aNm%1-o9s3^RB$`dcninAL z9wj57`$k3}tObzdfh@1qXiv!Pug!q55(B$M4n5U~&+Z}qtZbjQU+(Gjf9oGOhFci> zWhpI`Q<2Cxv%r2`yfTEfSm;C3qb*Fp8ij@#GrXn**$teJAtr z%bjtfgR1DB}Db#3MQik!~^YVgjBqM$(^wd;ZWhYu1)gRk>Rd^^CN9aAcjO!P3XhLYLj z_KFh`OsF2#0m17xanHb|uEX2*OL!+Y3~nqvPk-d1CU;xeRlcd)yt;6PjYj#y%_6^J z(eJK)15z?V+WS^c)>+ySkS`<8-}qikVcYw62?<*B_9~0*=-L)QF`~SKxAavpQ8@=O zT`l!rV{mt3U;As;0_&byp2m?}W8yfKQ&#Cs$!`4n>uO6`uXuw+BPTXE1sGY0?_+|k zR^DB{*e*U#hhRF1kH&)Unjj9Wbny&>;eJD1wXS2C$^nYv(+q4nzB`C0f4b~U7!Oc zi+Z0Vxz%$EVTpcL*|E_X5n9cIjD5P~z9vbv1EKG&A%3{C~JifPggNmC(T-RCZphx;*={`xJZz(m`n1EwX~Uwjiy7 z4##F3?-4(h&a)D#w6)@K5Vwjfw>;kre0QZ<@?nv3=&Jeg}0=^0Ah-(?415y`(l) zH4GxWfKI&<}ad5a{sPJK#RepLt#)ysF|F(4vpqBfp<}adGxy{w2AzsxBSQo zS$*ey6^%YUtJ?T@BM)a!c-VEXGdMpMtt zqL=jU7--bz6}(}Gd~AVELA&tJEBQ7=Sizy~uuEUe8pcqwPymjQ6w^r4R}xmWMCtP7 z*@Gl84R;wt=YoJ5Sa7Q0d6Mfm<<4;DFWY-AXw1PX(6a2{eQsH6cW8WEX(g;2WUe@} z4GQ)LTDpuYJxd5r&je+K0|>;;vHEEucXXdfAFt5@lf=V3A7LRlV6j-fdrT7%Go?R; zlIjAg96rxh0VO59-p6eAGJ%-JPy)ZX0UNE*>#KkM+`!(?oML4)XvNAOK7_3DTHwQq z!j~M3i{OOp#?}xf?2*y~)vH5AM)Hs@qFMSd`rOW*t!2S!}6dcn^ntUBx;yk%%_+K_!F34Ho- z?V+BCa9p8>k_Yni7jW5MX}QTQKFj`j4Y;2~;Q3c$SH+_{9_XVhe9`XnwLoi)OJoWE zTLYWO#g_XqQBdoxkDu>pwv*7*#qk($c@OnSL{gT-ZEq%%Q{#8(;Pkuun6OFO-vvJq zWi?LLuDX^L%yNDmcW3=I%GgT@_h*=3s(0x)n+c7=u^DmY12qZW+EM0om;pgAd%(Fu z2Iy8)EY1|-8>gQdHT(`IGBg??IZ|GF)mNmAJJ(Y+U!um;iIr2 zEUaRsq%VUfL$>U@Dz~~@=GJQ8&{3kw>LvWYZTSRX%M~n6fH4u~hrfydqyzN?0F_d( z*eQo@*ovF*njRt;btW45F>Sr~&sWeHb$qD&)#i9)# zl7^r7SpSA0N;o|xX+gMjBOI;|OR>LWU5Vy=x-Pxbi?xyPoy_IVd3z60@Bev5-PoSx z+1=KH*|Nl+t~lZ!`Ms`ieRynw%wvrbckw_R`;!!qkV9A8hG0UCbhEYnr17Q%ed=38Ae}*g z`1WJeOdbHwqm*MEt}a_NbV~{qHYuR2U;~%z0`D@oKBPZkm*3to`+%5 zKB#tuuMcdN2xbqW+b+kv|CV3m9(hUGdnrjv=p*R2z}nWKpd<-ezImL0J_wM5m}S$G zC;3PxbjFMLGJWll73l~1BG<+I1GrP*wj$Z^@(1#2ws{?XVNZ-~Mi8uszvalnO-bxi z`h*DB?qqZWY#p?VKuji4cu6Zr`z}giCwgm_aHZnEC4ED>UjCs6S*Hahy~w7GqghfV z>c6_*kA!@gvS)xdG6jO1Wqc7pKgCkv{c`_ulrmLV;6FC~oW;bPxjboIpL$*?a2r}; z4>Crt-<+B>uv&15EN>DrI?em@R~ibd5g=fT&dsHdOS~9iVrW(98uicJg}u>}qfR$x zESik3&3^z88zq(@4%P8QVz3#+&oE*7SS1(nU#|L(F|tZ8#M-4F%VVPga02fS`g(b! 
z<4_5($#i9{>lY1_h;tde#^1XPfdMnaG%j25ANPjPp@uGj1daj)vpmZpmK_rc!lM4a z8RV{4pxRpL$w39??>T6T0x^8$W9zUBl#yPUz=ulTrc#_t)gPcquKxakip{S+ARl3W zcynmBInD>|3<6~M=~eKa6ih*xlJFolChi*!iBzKQ>P)5Z56bunyXWj7i*bPrq#ic< z*Xhc%IuhOgoWNCtM8${hmnQbgbs{BBCpWXekZ0kJ)P?}u#1`2_k&!wKECgmwi2{w zFRPxvAl{Y!0Tx_3=V{ueWxhy-X)Vi28Rk60mV`@(pk?v*-)8TA3p5`(R^ZFF=E375?1gKGPE9r}+H@=0 zP>#vKDh{AeGI^=ls z@{Wwlb`6TcO$97txKxu^xOAimt)Ibc_js7K7~CeVS-n zvBX1vsh)h894kd#5F0&V6}GYLey5H+qWSJLvy2JC|KzX_3$&8I;*@NLI~@~!cqnY& zocv{X+yU_`` z0Is%C4(MdmjajWK>|?ZL41?u@0%fV|I-Kb|e6v#oGj|WVW4F+cOxe2A)2M0L;+zr9 z9+#Ma^Qxa&$$dh}n229x6)A3NaG1 zpk~#&nX7NOIC2L+Nu2_qWugRMt&w3qzj`E3;n2S1EE;;-NJ&{CAZ9Af;2rf(=L9l9 z?B-pqB+npSq+u}rnP^v)Sci81CvoRfKsoKJvxQlMM?o;SOKtZOS(D$+wsuKe^!C{^ zQtxkj+%-??{rkIV!5d~FjT9cb)-f-aPW_S9; zItIW8NkErdap+)hy1t<)(TQ%7gd9w}#JRq3ng|gDj+6-~?d$rci|Lm^Lm}2>H9UX5 zZRaTj5c`Ds!w%I4#rq_8^vYl}6~ZR0>ssq^AiuIs#9vIHTd%4!+p|I6R^L7->UVht zFDNVgxF{~y|M+Hl2v5#oKNpx;n)*x1@ktgZqqGQGWwv_-2Fm}LKn^mlE5r%F1+IDnFm-ZS{T86Z+^Z41p068 zzcmnH@(0X-nc!Y0T1?+*wJd`?`f|9OD^_XFzzHV!TxKEXpmBH#s$ zWS|FF*w_znuyJv5aDZ2b1AhnMkmFJ?3n}1H>e}M7_)-Z+CKnK}D%N*X>rMP+6S4D) zdPqp~n3nDdI|nBhH;<^;b8!htsTVJmlvPyK)b$Mvjf_o9&Fmc-+weTHZ`}jw*Bt;)7uB{9~hiOOijX5q3m{@_EMW7n$w3g%Il8(g&iSn3u`m9%7-yQ)R;)?H1+sgi0cS5&9m03jdg2Di^Rn%?T4IHt`wNOOVPLk;p-fAl@ zP^EZF#l~#0{>2|Cw#l8!PoCYWjigrHNtF#+AMudb@Lsdef5-q!W`RE&yLAge`>KOv zBMUdSq{|x}ZF{Q1ADL-tYU`tBzpjz4v+lXQ%r+St;`@9v;p@kk)D}hWFtr~-cfvc$ zI#oOFE9Z+|H?p6pojR|avaqPWRFD#Xp@*+06}z@^_7EJw$=D)Um2{K9Iz1mz_&Iij zA#>1W?vcN*dp$%g2_qINYib^s*oB}T@fH-#vu@PWR2P|=It+grU@LEtIka0MK2%2Z zPL`nTAz4CezJQ|rb!pmll_Xw#VHUMa=VZ41b(JH&MXX|OKt7=|#LYFK)`J#!MZS_^ zS`WC(@SpYK;1Y}7w@Sj3 zgRjcxV^z}iw1UU#kd@ab=k=}<9yS*<>0;uFB(oL)UHfXwk7(=>8h;y-3ZIkm?s$6l zPY2gzI4EnycbcGz|cRL3)aIA=2V@FY;_pGjognSy{zyE==Sy6-X#)PT`r(+MP5=hFbwmd z=p!16A_vgC7IQDJ{nU0gf-l-Q6I=E~VEei%rE2`enY^SYn|&Lk+GRCQ@P%^5n&cYR ziWms0ez<<`brUcpO_7(5=CzkAs!`sAEJ7)$ONi6@bAnrlMD)d7^IpeaNK65+|=T={GT{gCN(5>0gBjwqegh$y#45 z>32-Y9!6<;tw&sj02SXnmc{t~1E|!S{ya5?+BHq)?ph;3vOl5ySjZ~h!(?$aDvapK zdnd0XfDKoD*wT{z8l)jrPK9Cf}oYyGCvZgMcmtevFCU#9ZGv!i99_#T<3aZgY zNapj&(15#EV?_UMwy-i5!ia0==GsbVMBe;lBz ziW+?o@#+uw6yJK!RT;(#ULGR?Ms}qDd#l%S#q`oZc_G5XG58KXnNR)@6ntxR6lN52 z?Ri5le&pdF+F3PeqdH)?EgC~T2QO;7vL zJ_8HqM$zi^J>b%bqYA+FSL*wCY%9bC43`=X`?DsodnEKV*tL*nNPnPHm&(KDKHw33 z$y3ElnsD!w-tEGMXY@%U&ZL~@;r>kitHa`M-mxD>aZbYD**-q_POVssIc|$iz`){- z$ncr!=T>`{BB)&0)RBpP5fdz+73)z@pK6)eVd09BEmFSw#dxB+p9-b9X2W}zOur)1 ztGD4N;qtp+YxQz|vls6`6(_-6$l@b%!4XlL{(2E2y&x{x$YV2_pr&S{)%hPuNv*@4 zX!sHI$x!zAQ&y&(2ftbVB>s$xOPE%awpikMZ_||^$-sqE8=uo2(@x&wGpM2Ahe1nJ z_03F%vE@z5j(I?!3_y2!{ zqJm6H=^-E~9nvFYlz?=1cXx~sPy|Lxcf&xC4ngUL5z^h=&EEgl=XZSnH+Ewj+}QTI z&ht9+@r2<}S^>o=4bHl$VKS2tz1r8aF|_%fa>n2)k&8pk$_oMH@kiB#o}WUFF{=~d zS9H_Wjz!6D#3P^0j|yy6&h2>-?bghHYWf(b%ia6VEn@{u%+NLcgYYGTLitu*gLK>< zVI8Hh4zG>V&p@fZPh+6mKXn0@9VT?*lj{-wE2E6wB(8?ENEWJ#YMV6s9mzVA*mK_F zzam74{~W;gA4Qosc4N_<9&ssh2?)>-?dti$e%Y2yzYz}XcMICSN6!92yQu6D0=c$L zYu{?Zru}v?79@-f$1^i3Itm(u$SKMaUf(xhYN%;LO>Yg5oIAkSP2MY=J|3SxvV3uw z8!gG}XPiZ5gqc1){9T-|e+&yAvcSvsl>`^sFWYd<8uY*XLkFsLEP3WKgZ407mQP#j z(JP3;{!a0?ola~iQQN<7WcRY5TveAe&LCCRRJ;>w%qm;nZ2nVpNE8Jbs1)bvcl@2u zs^U#qV&A}|x6w7OZ36p>CVSvb`OapFrhkA!U2ZXQ>M!*>#{Ew#)a{wPq2g)MNmJ}< zl`lo?P{m6)2>DUlkWcu6Wgi;>oIPPcF%JEmY4P_iF!SHyt(i(yqs!2=c5|2vS(%;U zGZn4L-KY63jFROS#H(8hgMGtyQ`niG{kzyjZ4TkD4C&3#JHzHjgEe|@x@_CpkMos@ z=;1wYW?$F*UKs8A%oCt|ql)GC_SRDdy|aCn_Wo&Zw_9+xqpvX_((E&*XnlHXj!WBw-db9b}=IO95OVW-!A@5o8r~08QEPHc3 z5y7`E52Z`GRf`&sFyWl)w7@AE!MLCcfN7)_IMkcbz72GVklfFC7t|Ulf4Hdvy0#}J z_mjY&GyHEY5h;xF(WaAk;(C5x2`3WcHO&UUZ3h(#uA`Jj@aJnbdnV!)!5p=Yt%6)o zvBPB{RuOZDWe_N{-n|B!8#QWsE8N%j 
zKe~bJ?*Mv1XpJ7iM(=;Fbu!~JdVDU;z=Zkz=O#68wD^CZa_L0)#0MEPEI+>Y^OwM@ z5euqwBJi$66w$TB1B>=XGF?hhJoK8f>a2K(T{Sl@4j6vG0SO)T98gAu9!m~~b!cM! zBSUKxHD}&{0xGSjKKLp`s9A1<%UtHRh-27m~4miIu=@q9t6?p^HUM z1*UGF@^*2R5#Ud;O@&zp6ixj`r-L7Bl0Tvg)MO_h$hHNm$5fTxy}a)^)w-vqjn3n} zn{+g6q=L`ZLmOt+q0z6a=cVag29BzE5d0eYNz^&MFVYX1-2&#DfR7M34=roB$Uk^R zbl;}5Jn!RGFC^)(?WMQn-u4V1AaFT|*zM8CE|7ow@}9(o5t^)ep%W zJa|~uv6N=cWH{8dS2m^tt$qQ&kI(Z*EgNEf)B9|jh`Jp3u1^OA45abMQJyDx zvY4>m#%XvwXNcJVnknOvsB=t)nX4YREUl>fn2UxU-2K#dGu0K*t+0qM^!^=wRszM& z`>xWzv-YKa>7@I8q-3Ux#h}4Mlc;sx)eTqX69-ueK9l>L{ks5I+G`+^Bw69ADNbBc zU|!XC{eZu+5_K1+ZdRk-s?(4t{Wb*a!7y8%mmu(@8_B8*K&@m@J4 zaM4=wnC{Uu+vt4mXVt1+i-2Xw=V693uAYBt;hGQ61h0v0USHE)yz6LEOweN8f4I^!MAbZcBLt&>yv?ex*EXD!9S8f>GeX* zI7RwFCelGMB_YlFY>fFvOqUrKoda+R=%O-I<}0>;Opb=!j=VJd=hL{%{)JwFVWnhQ zp_Mz0$2itlCA|p}CJQRe;3N3oU-o&m{GD{0q}SyV-W`yjlHtE4JtxX}|LdWA@Q)9e z$&4tsx?79=hEMF;;YXzS9!J3JI0`(SWYP%i1#um~|E4&|Ic1qD-?K(~VjyV%%;cUD zJV|__*z4gb&dn5}raMNx0oo;3>_Bi5`3A8{n)5>OEsNfuX^wd^HsT)wmGhb z16*3W4`6IHS@+6(QTdw?4qLNGZi>=U2lPFfglu1RKH&O+Rf0Qg_~G-w0QbRj9IDb- z+iD(%xEC(}1W8hzT|C$U;QB}!(DCkeFLSBx+E?1vvUu;t%4<*RlHpk%8o9~4e_TB< zLbpI_90VY1O1^c)l~=()0Cc8IqC3;QjJb;2L>a3P21bemCZ+y7olo;><0V&-#fel= zBwq2(!NE|%OnWI1$Q$HLJ((BlBG3FxUv-@~5Gt=C-boBZ;2w8lm}USytfU*nix?$N zS<99&w8rFS<{FlZlw`tuE$$oj2(uw39r>IDJ;+c8Ji{zPJ=|;k2vxf#2c!VAIGw8=B8?S@NuiKwLX% zYeGG$ZqjErWvV>^ys7Ye6w02HKPS5g&#kym+LUSgxNv@`^tbva@rS#Xavjkx13b5D zNfq0s(HlI(#(VuE)0834_(lar?I+)P#>?Q$M^>upjJKymDRimF%k>@kUKTxv3~=$Z z!05SYOftVjy%I%G`f0xU-TF1~=*Z)WF#EKe)AmO1;U;zuPR9M zt?+pIc$?<8Xfw(`kcq5N^L$30b5g8*c_c<`01QMyu?y|H4>zbuPdeuSmN)uQZkj^` zSuveJZEt^qdO&8ClJbJMl1G&8oPFHIfGx@o0}EAMG4bXwOCISshnKuwC&(?2lKf@o z!u&1}w&4L`+SQdab1bm4mR?zM6{~Q0DAY~k%KoVgRDK1z_>fa00 z^%$N-_q>F4ZVk&XOq#n|V?uRp$owJl38Y1k%g|kM5-0E4ACGfG(vNeGPiz3g25wH} zeRVoWvl8@+8Z+hX-DEQGK_SJqQIW|D{>L|(1;5g=A+|r-N0vu2iD1u`o-3_fmmFz5 zvN!=^Ib*4;@lXPTHZcEP`YRimLu~e0npl(I)u02JHiu4d0wJtyUftL#ZDhRaZ9E8{0@v7!WyT?Db zpJ{>A>RF-5d0wN4TJS~b!xypq)BJ<|JsWNRov*5)ssS0EPN>W)PbyOK_hRNS=Go$B z+SI+Lg=ZO_eEh~~6Ze$O5C)S@&J?cFrMlwQOkn-p*LJSW1t zr7O8_64;;U2mf&@G?fuE5-pb;kVI|Zv*?o6!%uISuOK7!2knR*DAIHmZ{bq_2swxm z$GG}vZA$kA-t~|z3$&Fikj8=!tn3R`Mn8VjIzm?@>$!&MnH@g@TgUR2+_p0JJ!vwu zK_8WbCJlx^t>s}QCS5|X9UMA=ZjPA(QF`TGzTyVdwBYZ`D!&Z-&-D-Z1?$*aP9}~H_W5DyKKdJv zihS-(%Q;`ymnwB<49XK|LmF3{NQlaZBE#4XbDrWq+sXm~8w6pAL3@4hInt7j>xv_T z3V}e{3~*Bc=s!?`5@-yB)A&%@5x_*z_L50#GDzPqur>!QB~FJ*BoGIw9LwWS*?3IX1kWa*%Hcqa(20Rb<-=?fNiW4L0yfUFj9zP$vx+~79S z94`wqM=*?c!oI_KxlpeopSU7oWWW;V06>v{;>MqSuszk4tkuC0W+IM-V-&sWEXy9K zs12HQ1QUVjt%MbCaZk|Y9onB!BTjmfUo?@to*;1(@ zOZc49^AXW~or+{R;$vrt^`&Y@75!HouY9)3SU({GG~K*ztp{xn{eHl9`cp`DA}EVwgh=eCo1g`}50#_h(8uS~v&ttGO{?V>RJ zkpE>+HJt#~MakPHxwMb$bSz@0`s+Jw%JShhuoc?RLF=26K2RgI{<4kVQA&BbdH+%l z=|;cdZxU4RhGub!f=0zIF@wr^{c1YmaXz_|6-obQe&;pNPT{)jg-w*^`^nHP{&^q$ zwa9jwXTWnsQ61NprH;V0JYly9nH$xs4DAN9EaPO+h z2fV-}=#9FPnA=4G4^(rlyM2j79i!G|9)+}Be+@t3dbE8au1-mvNHVr1+ry&Z1ni4T zJlYhvEM+@B-NqV&>;B7qpK!QITB_=q;cTmY`Ho8C=BiAoDd~6Zh7wZ^!I<`PMN56@ zI+ZsP!&ma5I`on+q8QJqSBX{DorM40&S1136hoUyGzcx?t_5VR!QDz3|o<>N5vNA6sANeisJoO$D@7)F11crn#}r8;G({KVD;UkAGK zd8qyVoo`Ni6O|^9GI?TWg~AO#tC3qZLS*zU9M@I_aGLoLCt(}F!v~chA}*!LUhj22 ze*Zd(9yT1{lR-0qS0x=J-ZRi%fWZ-3nOW%%YAItLM}$v{{cIDZ#ei3Uw-30pv=z&Bil2L(_&>TSkhNSHff zCECE($7(Mg{FpeUBf?ys`zM^^{vNUQgu{wnHN!8ZJ&ch$sb~(r49H8R7$0YW>Vs}r zZKfPdDcj?cAmO3Zb@1oy57!^mi76yJ^ZGK^hsDa*c_xo6gabmcVy&v1W{qLq`--ee z3laIu57yl@Z!N^?h?p6QO=Jhp5J-?_@3GbnG|A1)y)rZW9R#2v>em*fEtDid5Irdg z{aC+p=j3a$ghDuFHkD(oGSu-@eB!2Xnvn5Lgse;SMvmO#WXR6g3fMH=0$%XNu)^zoX&n$NGESMf>g>_U5Q$Zd6MWW`*l0x 
z`^b3|&AlMmM#8R+rd@JRqUr7B2R3{LC!`0t@7lup{W7Yl{8`kGPoNcQyNcRRQUE3i z9nzRO*TA&fx_2ExMc@le|4!9Kc~nHxI8GkV&*JP&)yfdRRd6lGSNt$`H7pC$UQ3e( zNpScYlVKKl49i5C()9|Iq*$s7n-=#nOEfHf%jFimgVEYz{rQhqn%@jnZBv!-Q-t+Vc7O{^kpgW z`DG3)Jd0~9;E4^#V#Pi_F2Y2Y_1ad}FX4lVfm3I>M|zj4Bbi3nH?+}o6F+6-p%^)( z-8J>q_U&}yZ#tIj=eya)5&y%PVHRbIu@9}El_zRW%P{gVQ3?9V!wD3HYz~tLgyXk`0b_wUCN=*fFjK+X8A-xJ2iZYZjPiiHRzhqEd7nA}$6 z)NOqoxOI*0nHY$=Ef4-jo5O$qO~b=fP~f6qUP?~% zOG2mar*$^l0*Hp%L${UDjWA3>Zd*_LPYl1mJ+Me>uci7FheQjAm0}IHADt6`6U7(s zOx0`QG{AD2)xdO(l#ZY)_nEj{kBZ#>k>Y3O*e7=0L1soR)af{ADcxaFRzMKBdZzN! z|8}?cqOkVlD2PCD@tmudW`PA56i!RJ%f5yex;eaB38NBaHb8z4X`1ORzF}Fw`-9!( z<$4AeV_EPm4o8CT#4p%;8?%y zI*k}@O7K$J>K1pEjLXgRNLoRjBYn1;RCMy1?5<>wHxaqr%<~xOPOC*X>E_@OK;8X| z4A2y1i7T+k#MmGmQbb>PU@uXoF=c&@4GQ&?_|vhCYBqP0lHNB{1oW-jzLrOA<xH-V*IZ$*knp@DVEcLK)<46~+SA7GI=a-18 zBEt@WvJ|c{s-G%{r(QoHTriKA!#Qd>;}&hY(-?iV_iFc~<4nJ>HLPETj;*5~6d<%6 z!_xNROF?#H<-OOQtFEg4p6(Y+*3wGDkK98OA(-5;o-uywlyEw|EG~=h(i5?R8$JzJ zaZil(n|N~1ZZH4P-*L1V)wk5AZ5dVDhx_Dzf^cAb!wwN9hypa%Z))adD=z9At%qXw z_ygtQq`1tuu^L35lWn*~-A}>>2DSE^ydzJm#JtP(l=nZnFYe?;T1x!im&bmSeY(qW zUnf5aqChug?`v^D*HK1*sJ43DJuro&KYGqbxX^fDqhauQT@LW%m4xse2Be0fRy6H3 zx(zsFV~RS@&L9Nfn-VAZk_td-X0yl$Lpl)v%Tv82U^yUX=+&%2$Zz;%p3dSWt^aK>Chsu6KF$u|8(2j=W-=cEXs_0<&Z0NnC!=casd zHN%1op6K3!y|gM$n^OFXA~>qn|qtMy*%xG zXoJ)1&e48hY%;PIEaIMAak&BPVuC==>S^Abt0>EiQR1YYZ<;A<(pIj=ZM5FJ3b(5@ z*{EIUb-meC`km&G?MGR``@9;DRrC+@_G7sG-181#f^Fv9v2$19&9YMtx;8UnOfBzz z_yRIgQE$AAU_bLbr(}xFz(VDR4lPs0^(o0xBGi=I2W@#uRXcrFC@u4J_@A&+u%p2G zQCjwa#6C3*!a{Ad1GqL8)=9z8LrS+pR0GiwxTl)euf*|VmfmLLJ7qlRi+QOtv`4S- zvRbKLu0?xz^Kk?gngt-wRz7Wd4e@LNM?lWF<$z}lJ})agB~V+ns!)+88FL8(n2}HZ z??nm!jx=sQeG~TOv^Gm);TJ5|6}RGJ$Wr7}Jv1hcZ_Anir$M@1%F(-UZ@CV2JW6Vi zlSIO2@2roZMq^f4SF8#G(Oo~dWmmCauc(@!hYpN0@(k9Wqxv{YRA_I-Gwa}j~2f;aPw_A;ed3{BucYeXr>e3dNakxrT z<=d;VF?J*PShd`Q2UJ}XrZTMyb5vw=@* zM(}?v0QQBU+t2T8hwM zc6jshz|Wb;`KAscK(&FT(*Pxu$82d9rU3wMYwd7yDPjXyp3-KFW~|eMNMIX>9x+Ga z#SnOIWW4OcxJl@K&At;7DBkU;4NeM7%Jy@jO38Xzz4S=~&rOPXmS9Ai0Fg}>)9N?z zp6~fv&-1ke$1#vpmS2`_hC$Tr)JVJ>=A?o%Pvo$J5^LBpficb#ZR5yM^mx({nW?yn zp5xjK_9PSk2YOG<2~3x?y1*F}kOsQLx%4xODzSgYji<-@X(_^)gw&S`FzzDLwF{0I z!{Hlb6BW@9`##Y1e#AXH{DS50$trGP@6iEVcfsiPY4zwsjoZWSPo7qG@GSc%ay~G! zu2$OoMjzhQymGfV`1r)0b7kzy_1j)vmthfn=-mZMfFb4U)k$Cy8HsTeb4kZTX~N&CCvls&D`H7oP>gNActjbKH9GrVhW7`~f*$yd97-hhRqlJ}z<5(>T` z-U&Ppz&S69(FhF5hTttrJnVh9#4kE~_b`OH?ukP}8t?y_nOd$7%by|fW$&9}M zd+cDg}{YTST=n2Kgc~EzQ{{v3YSeP`&*XDLyQB8oo9Qu_hFB0_sze2V~<}NV} z^rVEaVrzORvGzmN=R2F}PUiSCI+vz-9$A$ly|Abrm{+&orb|sU;7j3}V+TB=^x*Al zrOWxl>)KiPheD>oakvA7Y=ZGbVf9gAAe)(5KzWhOAQfr88QNsCAfYsn`G|&O5pDx+ z_EtccjJ=tr^{eLM*vDkog46LcxmntU$t+UZN8B73IK!zKSb*vYw)qV zw*X)KQ~Iq2A%f=L?I}Lc>oekb*yn4)tgTz}mnsGN7?ar}zuKTk*FjG&w#a{>8$jCX z(w^j2;Vdg4*3&F}eE9+AF}c!Gb52o#H@xc_RW}u-<>;kMnQfKYzxhYMr|Vh(uL-E_ z*_GY6NJ367(FbSY1|XeZo9?DL?j;Vns(wtORJGS$I~7-$)DUY(|~v z0xfZpKsZu!U`vBKS>}1!hbxb+*i|tt0f>}3ZIfO2lc4=wRP6)&S^X)=%I)TEVob=I@)5*quc4Jvm6PI^Stk)A;_-){=&r zd&^kPTcJ199OeGxIFCogBHL+O-!Ue*vXfvUa_=Zx$PFMd!86^|Z(_dM>yPP? z>r-!JPdBrt2}AC@pndiVW!aSnO#pKbyEk_dKvRh#ZjTlfNpHoA4{Sy%+|f}3h%QCv3{Ew^YHzafjM zK*@|pj`ilHOG}sM_g(V;Ks>X4X1(YQ_A%<_xk-ody7 zp+}qcj&^zNh-2mZ_UV#(Q(G>PALlEvSmuP_LM;L)sMX(XfP611$fXgjzUhMa4X4vB z8)ntsawqa?8X)@lWGPDfl5E(+@U=#aU?*$1qnFu6$2yfumfHlMXzNSYE;_v6>=e@m zUeb}wQlD!)k$ei~+5Z0BZf1=wO;hzmk0v^j+dz}se-%88^Q?E#9K2h>kRpR(wYODnX5|Nd=b{zL^v5P)+togE=ZyUhqZOWnab>vXp! 
zjrmC{Qsf~i&9|#35HA72QQed}XN$qAn`;2gZ1`;KDa=*Qo zS?Q8D5TB0JBF`+xJd&a8h?4BWg6s!H>m72Zue^(vxFzlZa(GUSL^d>Jly`ob(biUc zZdz-|-=l8~0Fh>mpx%;UXJt>eb(RXLQ-kfOjo(vzO-BrMv9d0)baa&N-OkrS8uLnI z6^^NHBQxp6n_924qGEKU4aoL%*W7--HVXscUwX2##<&nr$ncriJ3720oS)S%$BO;* za#-=|Hm-BfhSPeolA0g1v5iVBTK*etqA0V*h=O`{WpU{d8a#i$h;f9^nr%ZZFfAL( zJ&+sz24^hFcdKHaCxONo_9mCK+j#1dzphqDkwN9knn+-+MJ}B8!{r&y8Oa|T*)CSM z;H8#t?-uj^7se$9<`sT(_EFR6 zcRzc`%}!;QtGctV%B207$9?Fo^7msPZ7wOQz19 zG+5~4-;j8aqOp{QOsDy#s^92LidOJ{1GGDN7dQ(eROaud@qXdz(A`6m2b{TcUir|U zE}Uy74KFt8eP8)Xn&?SMgixx>bi>oUdi{e_@SWxT06vaNwxf;T$7SYDVlmbNz2}G2 z{Hhu?9hbw>w#i5MYZX63=;-|`05X?)g86(LWSCMVEH=jT7g0vm!*wxGrPGT|2Lzh7 zSvF(u0u2;D79K4WAge`G{@94@jhNvz&J{b)4JZj*Vh=D%A#?S6KX@*J7wSBZB87zrpEM#_+hVp zSxPS~eh6T#V~ZylDCHgH%^&uE{;bbccxpdnL+D!uv*VB(Ag1?oQ1K7e5OkzYp6k7Q zl*e`(tF`J&L4Ez~CD}eYG$UgMLQOX*+X7L`g++$An~tzW2@?+8_IOx~D!+qY7?%kp zficc-*c6pgKyFicPs68oSLxaBli3#YCNCbhi9nx-=TWM#m)8S@@5;10=;T_ zh7b$uBgu@Q!5}bJBOS%FS-`=m-9(1Z14P7TXr`hY zx)_bkLg93&h~5JxJ&KzYR@tx1(hv`!TX*0smPPeRK*meC3<%97Q<=`F^fK*K0LoS= zuym|`9;{g>m1+2JxA6bmSLqIG(vcVPYbB#yz&Xv6eBmHbFrO*-D|u4MUmaaaxDSx} zk{1RbXr0cNelCiOP}hN7*J_2%Ph~VA=iqAS2{ZJjEE>(Sk0FO1nE;wyz}xkd7va){ z=32D89IYmPeu3A7I`gapx5kAhVi{BOcDGiJX3~JEq;00c1GA5{X<|{rA ztdM5B`l$6@L6NEd5OGovbv|sP6E$tdUHQODnfxjFTy}zanfgRP^v)O9fcn$)ZN>ZJ zS53>Pidn(ia2JFU>-acSxBK)nmVDIp5)8h=@a2+agWMp6+?;Kpx8(wmDi&o7zX>^zF}H`)iHfk`#uppb0v-2{3SZCWy(JZU~zCNRwP z)J2U(;7^IM-cN||BhERJm{Q}Vh(F5ZfworN1`< zgRHnuCXc3>XAp1MTHA6;DSr`+*a5IYqeb3*J4%8T_P!C(i0K+R^+igG*X}+mFMUWB z)&vUbO=Mf2Zwv4*7M?amzs3)>tqk9pPC+|q*88!v+}8; zy~L=0hA%BSNTRlwW>=y_%lot0LQPV#+Jpjm%Oo{KX8?yP*_IdCS`Xuugx`F^uAk7; z5gLB0WlVw@Zb2V&3ySg@T{nz#@Nsb0U38*pPF3eAd*v(& zd6`t3JiTXE!kJ|cB5e67fBp|~o)&)G;ZpQ(qLMTOrn+*mEr<;si&Vr>!1YWk?-h}D zs}bZpWd8Fcm`d>S*ns}7uGRXcIULIRXeaNiHGwd+t4yS9+r7kxO@HmEt>rJT(Vk-@ zdtU?RI@Hq6ELqg@jnDsC?ptvdy3wkVGN1T^QDFUYbXR=?VOr$FMjIo95o3#Vz^lhK z3Fa}U>~k9t->Pbj@X@%fajDIEVm=!i4`8D&3s&H*cD68t3;m(ssJ<(U_BdY+Nk%+K z%n&Sg>H6zHyKb=ltRv?*>oQkY#@K=ky1@e7j|4ozK;Lwp4d1g!NXkdwJ4e2s|AEe4 z)1gy1&`}k`27i}wOmjbZ_wQdvGw_-WQ?LryJM4D5Q@;^U{y{1@l=B~GJ+Jyj&(O** zLa@OsW`jR}E}J8XoWPG#QZ&Iw!#qCr_9^gnn6v}BQBX74QL z+D$jx=yu*$dRI+6D||PZW(PU6d+%}o5y$|jF;f+jwyGP$HLSG%hV~QD0U`HYz|c#Y zXbZ`k9ehQ1G47b@S&e!bQl>UyOeNOz2~V7)B+pG5jX!ys)fwiquEJsG@<){9JGGlH zN%^*V)+7RG@MMPU+D|TeWC`>tS@i8~+yb;B0iM8%-8er0fWtS@li{Yn9M!|S13DUg z2&4JbV4Cip(!P&S{#(m8(IAt)SEoDfQrHFE{fgEw>ne8qE_jU&|Ni7ss@9}Ia-tWI zNt}O-(>#CJ`<>R}?)P@r2`Fi^1a8HAxg=spQ+B*qN2Pf87X_J-OQDiVpRY_{6ik|* zs`9}Cn4>lw#GGCmX3Ift$!7#c<$ETSjtPu_U_C1yE}~g2lJ@jkWJ+wO^ev=~@%O|- zH%tI1W@gpVL-*TPaPBn7MZb^7h$R{0;}lBaztquKiK$B`uv3QcAWRP$V;cHGqqeorpmPF6BV#HbT)2y%OGcyk8e+)BgrrKd3PtkHcC9XOBUOUY40rReF z>n0lN66CBm8MOO=Rg6D=WGHEITfcH`=KiZP*1VTd(Xy|i*qTN&SDA(W!z~rph%JlZ zaBe|moYUW`t8YckO-*sKVOpMOC2>+@c@vex5+70dt3Gp`=b|ndf%;yR%Car5dze26 z9cbWCzVeLRJ>aO%JuA}m3aB&}pz(e2x4?p>fvq+xjYxNIE-Tw~&TAMqccW%=GW*j7 z5{kRiusm0+G5HQw=iOfyuKu|3$r%^Z=MU(5n`S+-E`Mfr(a^@8#TP#84v%~)~i&Jgqq#?blOd8ABV-X9oJFuJ*71d+rA=C zBD96K8IL?@ENF6?x5-tZf};(z!++<@6Wk+Lj&uwr4w~)jvqfn!6!_4u`^|LKA_xcc zBZdm%U4_%elT9!)e8L)HWqTo+GI-*@C|7QQ-VopWoo)kVGLdZgzC^iVrLzU0jHI(p z%)q)*9ba4QCTEP^M6PV4&2;t53q>I10jy4+2rzoS=*sF3eVO%E#rLp!-@W5QO!$t2 z_S`hQR^X6S;6agr*P^z@vH!=T(V{A}(4CqhuCT7r6F82o@MYSj22t-(QMn2`9LP8@gK-v?TEtv#S#+DJ8yE|-VH4jD-4LADf^dy4Xkq* z0C0iv@0v35NME>X);SP_E<{A#5sw-^3Bz3;<@$~=691*HzoT^^)#&ioRKP?`owc)d7f7bQ-?0rqs z%FEXEJ(rHMUsY{$Ril(?<*-gslA$MshF)q+-NZ_pemL8quZPv zkPQGtSkQOIZNFO!1(B9pzjC4I+|q@A+9s%c6W9d=$tX7p<5g*5k_3z5dIEXd1n=|id0RG8hyb{&>qmVJKkB_H6HexV1|?f#zb+3dJ5uGmIX_Y~Gd 
zH(iQMbsGw448M`7Z~m#!0;p_HQ*{abUZ)N6T_=6CrIJWcY6wxpPYRLYCFr(;Dk8W)==sF4sNhtpYn8gfk$IIf)$~ zD%PBn;kt}}2;S@0Z?p(YRET5?n=P7DVcY_g_@Ex&LO=Zx)qkqL@LBx0?#urJ-JQc9 zU$XQdNm{)K>PpnGx(W?oSOlFe!69I5V>S0ilny&eN2JzH3vb2BdLgqcpN;(^`nC3V zgRF_p4?^ag*yr?J6PWlbixSk0M=b5LNF7>OY(q**K}QduLo)`1<`{L8&$uqISY$;5 zy7k-qJmdHL%r%1PWD_XD*8)$sb62J{ZSJdJ&W@__*N23P@6$ttF$t6~@6C3D^`nO;U=S!FQPSza znL3*#{Z85>J@v0s#k`dyU1rv4CUz}Y%h9JYL21UZm}yy*j^{`*#~;zA zT`hKtm-w$T66WtF9BQ4|-zzZ4nhtP9nJ4rgtF`@P(i&p<^Q&XzrJ0~B?2P#HB5!qb zEEoOPj_rNjq#wn-^j-;0{eLl9D5>p~>c6+D5iI}(UoIQyeiZzUNK>h-unwOaoGkdD zXa~N8FMlKpp4w3{jR+mp2XL_J`F8J*WN&<(5w$- z$(y37mSxNEvhV1YZomX1>olC&r=)-~Gy!}^68Xxzg<2mNkbtM=2Hv5Gzia@G!S2CE z>$qxCi)=}?4Z73&xT=jhd5-+viH`oCblG>o4n7VyS$j+U46I?VG%6t7G|?7$uc}!S zzrZO{`d$lhzZ2-O{f}+fbBQ!Tj4Dz+fn!sSRO!!;TMO}rFg0BQfpgx7z3OgE9~D#H zD0^|Pj;moeFSVR{OHEb1F~BbcSU0sN`_OZ5Jf(X(pXG~j^P;CI$*dl)c$+P#V7riA z4@)My)Z^(#@!@rI9JGyV3M(Lqb- ztkdIjyW9yq{yn!5!AiGYMs6Lv(^?>GHvP#CC1@iDScP4J8!+hUs=A0dd`*tA$>|Y$ zI#KFc1?&yL4d3cyaRb8};?xx~KQE;Kaq@(kfJ1lWZMEHo|X8S0co)5{l)!_To(Gtke z^!SK1i(Zzkpixhf5@i7RtSH}{5Z{Mauwz&!f94nc4mMN&*>enmm7ipc(42?exaE%x zVEg24GW=rE7r-xObw&bN`euF*Z@gX)EJDZ+sqPkFC*A&tDIka(mr3Pyb8}Tx82ag0 z9I2DL3HRM#-fJ2W7j6D$b~0!EZu(t)v+6XeOV#VWIEeE6V*f^q+{?(2H!_>I@|mhz z|GvR?_!DcwuewR=lh zg}+L-0do0=D(_C99OG`Z@OTmcrhy9`hb6L0${rbcx&~_mX7hWOo!uA?%sj_)9nvoW zy2BFtNjp#7ZZvuKVR{+*KYvFJP`_+);9^uL`O=X=KWjvBaiCN9=1iXJt|dRVQotY(_GA3D7WG-C1pveIKJM zsI=-u@(uSlrV{_`(d1q1qiPzq+0$y^AEABjs$(`2mMidL50}!c7#hyX(y170mmH_} zapH(3i4jzaKK;`%>aSI*>_D8ypJ;{6G2oJbc2l0|E|1{x6xl-Ye|@SaSRg=y^X(h; zBlEBwU&Ywq>tt8RhH;~X*|J#H)J)2VPJ$n3hLn+^=CY7;{&vgL=2etb?U~!YuHQQ< zM@B}62t9p~)9?Y9N>(0`gZ9F5$4N%7V5D`b*w8tA?=90gFMZ2uJ?VF~2FsEbr~eMKhlbO0flt!1f5< zL$Ncv(m7h#LPlKgr>&Bk@5(Lrr0D*JABRQan%V_tU1pOTt8JVCc0)ZT+p5D$gQIAS zaV3S)CQ8+Jf8dFj`Fk!|P~Bg1O%7t`+IrpOMQJ7mExOuUgl2IaSlPYxta{4XkCto8 z`xK_P4jI&`eftIizQ?jmb$i`RdU^?nTU;RE)L%n8X*ah&d>Qe6ocmv&#umC}!*L%6 z|4?C|gKLW?bN#x-u*Z79cg7T#FM%NQyPU00*q{gUk^5p-SLEuvT<%4={x>sc z{tyt8dX|@&DE!6#=8d59LbML}ek`ftzWs~tkS5w7bAvEEmW*U$?nTgo`*sO3*Re>0 zJbvuFoWU-YURpRm$$2%R7+%Gvdk7!|!kNXj&$x{Qp}PC-qt6XyfR^d7&DNVIvMC=Z zoMj}IN^Gg0H=9OdCyxx%ENPsmQx|8ohv_D&H;j$O>$i|{zQx1QzxXp^S2k`HK86M$ zXH;ICyE<25M3Zzdn~fO4s<8gWxS3655<4`(j90GRS7(VO)lZ)5WY^tRD%~ymOK<9X zOLiLw+6t5@BX{=5bd;oCY(H4fR~`hRcYrH!pHV7nU8xRb62AssNPW_w!_BEjN8P5V zf68`=JbX;olc$%St8j+lulXT9X)1Mhi;n!|@04uZ+Xcq$yla@PY9g7;Z0Ol#rF z-0n0(o^VV4arUZ`|E(1x0o6@@Q(q*tVDQ5p_f7vOVGVF4f4a9o#n>m_7yixOVwIQ8 zXG{dHL<|xv-M{e7ctkO=0g-9Wq*3%-0B|9>v%=|Q=L%>7&nRlI?c5$&JZrvO;E6?l zvJ>a2x6#@G(D(4eXsE~nFsloNu(F0LR?mUZdr43v|IU}m}jZ;A-2oy z=Z}$ebI=IV^_cC<-Z2fX)sis>Er;@s{zJ{YmV}}C$t@{WdDn;?+oq)Ycr5rbACW;N zo;_r1o!U%kT|n!ct`zy9x+gH+j zgCTz`qkdvzitpcP>h1rF$skRjtFgMBt_m@?5wUQh%lK7M7$X$l#EVi(it&-|!9UpF zuFo1rq%>YI##;ZHLd}_4btq-C5cXbnd&pqBjZXjaV59zfsXkmRCCt1YRA5y)Y^4BN z74Ffc1o!>bTcl|%@;QDdV>%EY_7`GrjQ03JwUu4UpSd|NM{H!9YRE15d-Gd$LI(Kv zOP;Q!*Aq|j`_hQBnf8e*!O#;L*3qLnifGYXg56I&3+Tg59b9~_ex0XET%)?X-a^@y z@*QQoqpp!l+lNn`F^^5!NNRF00A%zwsyI#U#oIWtAh!&L^$8I$kP>^3AWm_+YTh6-KjoX)&XJ#{qHt`|Z^ecVF z`LdX%`Hn%x>DV}1S$=5xtxYG)Od6K`>_~~Ocj5%uNH5(8i#6vqNs03B3Oxt=qdn+! 
zzOjolfv{s;vf55jc?8WC>ep$?AKhNR9ww%s_#sGd8%16zKmC~rt0HMqiS8c1R4Xeu82kCr_^FJbXghdc zw@(TqNc_5j0_mk30%aIDn<{)StKr;{v2h`=XFHwl#kbkvu)kU0W1RDoJ^_(lB#t)C zi<DCm6zw!(Tl!wa zgGTkW)U@7FUdQTiV~qMPj`uz&{9Bws*9BL#@~=89`E5jN`$Rj&V+zolBvKa`Up}zF zaqXC{?5y0_`cn-;20dHM6E_kXjg4b)?58=AZ$1!N!SpkBcHfby$me677U&RumKSQI z^e^VUTlfnk?jAh%jqv@SXhp4ZVGV&u@c%;_imGCJ4?q^GRv${;zG>|-e}y||N1#Jx zm5;Udph%aj;L&|}tE?1AoH6vGg6+401$S21bDRt0qPjVd=wqTKYbfzvDHE^tDB`zQ z%$`53F6oX5W(O*^>b?Le(h0=S}3>-D&xl;MuRoDeZTc+WY@{gG?c^d%;+c zkreUbe*CU;MGp=tFBQ5+S44qXWOUuIbjs_-4N63uFTf1{UE4#c!6a49?snwy*&lX_ zXZ;%XD-~Fz_zCBuMRIhtedc7Cp22gMae4Uv0Cz!%zR9j992y5SdpJIX`$?{+pvCbv zWNW-YNU@-i{b{H{{VTy{{X!c zkL6b*@s61;KYa&3^w)3m73E5q8OK_kBF;MEv6Lc><92%7ULdfLe5Bz20CknWk*SWS zX22gY*Zt_ouO~}&ARVCo6{TqlW94>IJs66`$s=V+v(bLvX2$~NFU7I^=<_3C^2r-# z(Sfc)814uBx0e{`SfA%p2&V2;zurYZ(zEvQH2SoTie%h5A462ikWX5~neJg+s-`>R zEBuW?_Uo^I`}ozYYO82vDiTJz_;TC;e4+VNGk9(s7Fd`3YpyC4PCpv0KLCya$K_rJ zx2T=&d&)6-SmhhT@y59;lT@bCl?gk4O6v@RlyTF!ss!MSKCI4kW7y^`tN^@I zj*8WyA=+_@w*(luJbwwN$q)`b!Ty!k3eHU9t3@q{0{ozRS4-e^$BOL_`x5^E=q9;M zQRjlrN48L9PEH0`Dac~J;8$hf_);$tX;VW#+HI~a%#hp#+9U-CZaWabf;(4jABi*1 ztgRN0Qx#8D)Fab0_qvn-`b#^THd2fsZ#)xpw{lroTdAHutjFyg#HEOIki#y2!k z86;ri_jzCyNdEU;IKUO-`r2&0kOSQGHR@WXlcxAW2Wt(hsNMnx-+Ul!k|P=US1a=o z_?+UnpBY&>wu4evc`eM136=55z#yNf&tBv3t?AcF(VNwoN(*#g=(f*sC9)=WJi`ZZ z2k&}-arMEkNwU_~e-eLXiJC}&8Q^0n&rqYkpc(Y$xQ_{xT~ALANL}Qd5((;ifBL;^ zqww=s#o{gOz=`5=HcO3#zUX34-XM;DgyX2qV~M<^k?7!~?n&D0>n?dx2eZrV*#4c;F@LmAuZ+~Qb`#D9@+M& zbgffY(}TkLUF7P)?iyJm`H#v7=zRbf1Jl11$xU*~>{Sz#6FZ74Cr8(zADw<5gfRMOAdw#Uv-ek%zr zPP%Pa?z|T-o8j1Ua=?(rx&v809MeCt^mw3Smp?NcR-EY4Ln0&Wc2X7bou}6ove@O9 z#K+cVw06M({{W_~{{W)g)`UvQ3M0PuRRFdzlj~WU`Nf<^{izTBL6Vr3=Z-J>i2nc& zdO+M5}!9gb#14?VTIGnDz)EuMoltqjR^B##uk(L(2vahl?&q@DU6 zg%<}G+_4IPKqHcCddABjI}W1*+Oh4T-7*u{^scVi4$<_lKD3VNaj>w#gNERa)j!$B zI#lZ)%baxUQtjfod5()2lE4T#85NS&(e5}W8Lhj*NGiP#Q(2}idRFjy8Oa??mjE6Q z<5kOsUp5b+eyghrfbcy+>%2kBr^l`>s9>nGOf>S)8=wVf2~oNjzG<8 z1Wb#Nb90f?X{SD?2A48CkwY^!N0mAPj=jcv9OA2KjL$0)asVSfr@bztYaPjpc_5)? z1br*6gt;ej6M~ZG*q!h6tBd&joAuNF(f4gGHpvd)4ob*DIVS)EkH-}*ic9^LJtp$r z(9Fa}`EOvjuqc~$@nsKMi>HJ_#1HP)$f_M3|+ZQu_SX?Ktq72vKE^OKzYDXp(v z=rX~1Ya2O;?_|hjml*{}Bc=)HM_l%<==-iReQbGn>QIc|vv*(Chq-C(BsP}z*O$@V zUMdxmK%oqbzj%cmI*OX@ZeG$sX)JMEW8`@ka)a(T;=HF%msi(5AXw`MOc6d8KO} z3s!bKuf*RFwJl>!j@CJnA321xc^iR@E-@G+1NVmFK{?3xs&VVmwvB5Y#CHWViLYOC zppkQkq)?@pGPgM+fyY7#Z>DMXy4Ibe&m31VOA1^~C)yVRO^gVQz!*%X-QfoMCCZzT>lCseW;fov1JpI1nXyZ$D zc>c^|lg)Be;eJunou>!4Jw_{^*IwowVc}Hr+7?-sFC?Y_-#CL3*`kLtN?k=BA zfntpvU`XX@E+gHuc^gtT4VcuN=OBa5af;}Y3w>Y0zh}F%w2J0ud82iWvvB*4-N6(w z9ZpE+AmhDU>&8y)oj5#C)!WlXbe|9Gt}k?(EoW5=HPb-8M5HO$nQQ=h?!h_wf!epM z^>4H$t9cZEYqo)zV*4bbR(*@}AW(RH+2h<)J{PCA&4D7>-yCTi`^;i?e3)15nkVR)my!&cMKiM8x8G|$7&nInnLxzxBmdX^Zc_y zX>}7)dRV^E!#ZOpq2*kHGBMh+Z&o;+yt%h+9#{{Y2`fxLFP;@WaOf)DCxN0uX$XE(9iQZoevo`chx;M>P@6Y^TgkM^ZO{Fb4A z8(uiV+hgl;Px4V!&WZL?I!#khmBSx5!0d$7W6R6t$69VV%hSoLH_`+bU(~fzqBrtVI@=@wnG~K041)4Q@PCOp0;(3 zF?#A2qaXCte~o2pT7|9Uo-L+XiY(6C0XQC0kLzA?b!|&--|Y*XzKbT`<~DlIa&;S)dSien#~{{WuGoXya8$?Z|G`Oo?6YX)?q3s^X7fA|;l ztln|kHKD9vv>!&t=~;of+D|>IQJ^zzHK7UJr^a#s86^HDvmbM0C?tWCj&Le?@8U}| zjZ#M2)s-^6eS6bsv?XN4x{)EbXPLsP@>{5^i<_8cvIZ#&?I03(Ijgr#74@%_8!E&G zBdGrX3dO#(hRso;Ew~;@J!@J~i_z|7R#IB6itci9Y8E`yj(ZA#p0(JK#U?fW^)QUn zo2?)qRiXr?%?v?23{naJ&<%h&sblBzs)|Y5liIAteq2@(M@$&YmdP0P6}92g1T${r z61;lxSQ#?R5kYQiYQ|r(UdaTJyg;@9kn+8NJwKI2b^E?lbm3FKJJ{U)ovWj^ z=m+IorIe%XDl^7-uDaWhI#-2NYI{*y7vdeb=}naE1PpcH)8hrs)jaz8RMNs89f~kl z?$F)axDP5=s_2vr>bsVDox=-g?%VUwkl&>? 
zO*LV%xQgB-W|0Ug&R2QumhD|oc6yvMO`|J7)wO#)TG`{C;aWMLa-G<2FiBj1Mg|Y3 zHP~t%C%t=TYmsOmn4)k~dH32#k= zkTYAc*(95=%6)O|ir?CFM;xM(ytgmKtjC~QStF!UK@vXVk*K+!c1ZckVE{bipQ+-V zp?H8cpETDpU46RJ7I@x5UN%_>Qgh1&&mSnpD+XOUO=1WWOM!%HN*k+(3Yg$+!1*Yzx>wqH+E5lo3h zM7&sVxmPQ@Cn>!2Ip{i8=9Q{z7Z(ImvcAoGlI(~o-RoD*6eJLcxt z?rxsm?)qshK4g*I6-u9x17x?+$O){&neb*7g3 z{#3%|fsYyWpyw3O-1VTJf7XB$0!SIgdzzAYv~w&_>Dt6s^Xx4u#T#ux4>)XNA5uT1 zK^5McZ+~u@q_e|oD{X0`APDi!Nx}BO1MzDejvumnDXQxj4E$xegA{A^F zAf5v;Cml&@(lNR-lo41?p5Q|Q%N4RwXW!}X%=?(km`2VcJr`QE5SMFM8wHZq%KE0e<4_tYZr0I+w7eXdSQ+|c%seh7qP7F2k_>i-;&21 z;Qs(BrkboW#u=tX1ZOP3W}|&h(E-imBiqadqRhLi6!n$?;2zEV#bzM$4ct{5re;PYo_$Y4R|^2new9jNn|3+q-_!p9ty;r#8k-Qe>rppak&iVI ztb>4IrQkDslx_Q-mkJDzbK|i+=6Baal8H>^|M37Lf?!K!a9euI<>GRypb0g0)v&!dlJF#Xb)RWZG+m(w!o?#-HT2+r>z#(yvM}NEl^x~N_ zZ)R^^^rfphX-TMB=UZS-6z84}YqHa%GQgm6*u`-AVP}pr&Ps~yH0yCKl#misNTo{- z*%Q-Whzigaf`lag`mqf=5go z*J%ah6P8G%R&@Kzo=4$awIZV{Cv!-)<93SovF#c{brQm8V{V2&v~)m3LwnAx~-lhf(zSyO8kXyplzup}HtdvV^nsi!TDIaG;! z1>N_T{`)4dVkcGC(1pYpA@RrV@JA=qpYz2`b=zUUhGX>rXZ5Kp^%$T{w$@t|oS&Qw zpMEQ$L9NUsX&nZaWe%Tcw(zsC3|O<{B%Zmhd&KslJwazsF*5Gs)Np^LXr{4nTWVX! zpLQn|M_ag_?g&hqR3jb}IOKQcsfsIGBMRKFosu#=E2z`0!MKcIVEoK|ao6>(S|WVF zIo-RU>!1GsU38GVk;5s5R*8%72VgVBSq$k5Xa4}KDXhfg;QkfKz2(^?JfHd&-{D?^iqGNb7Xy|~zlE{^#tti#3XOLX31l-PvPtrQ zhiNz=fzEjAf!eiZG0SYiNla0){L*bLupXxala9C*ACk(JOd?jdGt1@NuLgg;+o&H= zR|jOw=Dp-!x7Ykvr$Oae_=fFbJOp@SIrpn}`jyqp3h#3%Vm@LV_3uu#)vgAU2Ay!mOBg|9ia*_D z$)GDkfB05E?#h|3C|41*oaU*KvvT;SxKclvj#?I#@a2TIvQ z=W=DZE+Jc`X-tlQ=L#@S=f!&Kc&o#s!M4_VeT?Zn!i8yNvl(_?zc4<(g?OYg4?iup zAMGh4^sdpaAsQ}|BnxU@J<4{@=LbCkme1%Z$ypGc-lkrwaca|*3gO4z`3+?1MiFfT zM6xMW`H28!w-b@irYcL>Ot-Q-ZBj|j(~r|NS5jat;{bw4$>%-KzZD8T@;2u8Bt~Q{ z7UFvINpQOnAwPQ?o7kR%@EH8*i6+HT05|~RqGmgZCnTJc>sqC$l%;eu9IwAaOeU4G zIS1+~gp*c`Sk|CXg2#YK`c%fw=@DJ6rV7I#n8*j|PU&t5H?ax(w3PE3s^p^q>QvJy zh_IZ2jt2E3pNFvOdH~e*ON*F% z^(_KK2152`$nWXt^{n3l_-Z-sE%lvIV*bt>%u6B2k<{RlI$(3bJf7IENt;&^M)8M_ z$zX5-@_*o2>0TabjhtodsBhK&hp~Xdb4Q+A6f|!K*lE_}Uop6spDY(B0*;&xr2TPL zgckO4u#BH5bG4t5_}4qE_^M{Q@=;1|J3v9y)ONJ-u&-AYU0IR~C-?JawuDQ?sghB^C-JOxzks+wAa>3+r_5=P>rl^nV2=VN1hlT(LwW`E z=C^oxzK1i!L1}Y;&dA60rz}!k$RUn5k;%%c44ylRj9OfexfGHeyReb5=8Q0s5M+H;OGRrC%poTUZ7>#k01M zF((}1+Z=P8F!rkvMw@dC*H9)nG2BSIz499v`s48A)q`g&R*7)ht-whbgpNy)>~J&A zRFB7}J?f(hSG!O8`_6gBQqY#iQnl5l8hl#K?6)#RQ!rTww=v}47afK@@=tm-PYFS% z$gwgcy6v_oCLWS9$7~D<%v`$P)Q012Y@-wO)~z_Jd@s= zh^L>2lgn0Zs>kIa`2=Ge=j&V3gk!de{{UT%Hwi(QdWEbOmjXC$;<{3yi^mfv2h-;3 z^&C>$OjnOuitv`WPN$=XL+s|h^dA869oGOO^=bT{k_!`f#jcv6HYnfSo&SrcMz0gE8MXI&=w>48g=`a z?N=Xo$hkjGY4by~!}fO^Wuym#{?!m#1mC<`xWjTWxK&v3D_kl57?|Vm6!{-%IcyB~ z;-a%~*EeT5%uF-?00CN1T8?i*IA%X)PI)Rnl_8TgMk(VEqL~1~;{?+Est`ZcXXp(M z4&|wy(c94)Yl+GhR+$Rt*c$NoL0<;QZKqd z=lRo=Z?K~)7G{{scPo>evCUT0Pwtkhi>W~;%N)}txRQOL)BrsxayzD9v|OzcjNEo+)~Bzwsij+_T1s2RXue#M z#WR&G#E?krP7hr4u6emUj-wwOZ!&f`E)e9^r)L?antpz{;f5oj@6B&C3}@(wYj@u$#JGzou!-0xlqTs z0dtalg;_Fb?QIlE6HZi*Hha0=1Yk43P)R2~++wkU4wN2i({M-6`$mqXXVSFm%Uhv+ zsIEe=x3alnBN*U=G7Z4`b6R&glU?3I@`7DdE@e<8Jx=L1@w=R0bN~*u#aSr(RCpYN zyN^*_6`j4s)yP<)cYGELwTW}+IOO;I4RPXPmWQK(yG~5)Y&7dRbbFmz=Tv4e4E>VL zRj0>5NNxZ=xWE|nt{(GWxoM?%)<~_%$JyRyAPz$J$s?|K=i7=+Ntt7w-r^rB;y&FO zTm764IP1-0eXjJVU$)%s9TdvM56sk1p%$*kTM;NbcNOohE|np<4|f!K@|Icgw4VKI zuJDGXsM_1lr?hfCyo-_Kg3OE#K>0&qNfU~wnH6Why>j^d8%<4`y z$ul_oiLR+ri;bDd3i6A*PV2>*UX!Lx18KUt-AN$CmZYn^BN-WG;4tXo9l-AutW#@r2ha~>2#fQPF)pk<}*cbq?uSjxd}bS-bWbfKN`-pIVfn2X;h1r ztg9<+bJrQ$pTilg23M0ol5hFtGH2K@BR`#H++XRe_>Si*)0mub`Bsct&WO4N{+%<= zaV^|FT8N`DfZ&Ek-1z86JRH)5Wu}6uDC?<@CBmusoG;^5Uit;=D*hEwbmb=mrZl-$2Wa=RJ-pAGSq|WytPA{{Z#s>S;)D3Na_Ay-u21;4E>o 
z^Vk!gra7nXp>s&h7m@akQ|Z7IE$bEmZJd&L1-hTgw*LUJ_9)#9k!QT9K+mT*?NZ0~ zeL7hOo2G*C7|3}elL|%-IV3l|J4N&poRN{DxVP4XM^(9KizMJ7qZ(6>!w=l@UMf3J3K*k}ET|*TNGBMVx&m_@l@#yo}x=m{2 zIe(WpJqYdjnv%<3hRWq+7Gml(1Q>1(6zBcZkbZ`;WRfYa8P?Jz1e3H9Lea=BeRV8H z;rS(v-KIs`9{Bv-z)jNuSd0vga!3`{YyKCq(L5ccOK*24+b2F`nY2u8W#k6~a{+0`@XwQs1s(hS2dy{kahLf4Z^E9IT7}8o6|Ll~c_m_V{VLLUXIq<;((ZjTCKn%> zr?*hO`V>vPO5<}N=L9j^&%F{0MdfAygV5o>QCl#03tyY9tTB)FrO*C?4O~x#def1* zYh@n4IsSr+$GEKf9KaAeg%YBS^29b5>%pMgtDHnqVn64bXa4}NU9^7-FHA3;LNM6v z7a#f+W8VBf)-6FX#+M*?3!3`>FlZ%Wj>v_!SEQ5uFZ?q>DLsAWw? z#kO5XA_0Mmn70l+$o@vBlKn1Yjc%=(1Z%Zg-MfFf+0Xs-DBob)b2G?Y*kA`eaZR~} zKwY2=k4n)NNhr)0a54Vm$I5TRcgy)TR@5V&-)Wu+A|Kv$!2AzG`O&S%rI}2j1CT{C zVC0O|ot|go)1E388*&bEJ5v)OWf|iXfB<>wJ+n_a^*Q&X=yFSRrXtPcw)cT%iq;!x z0)Zk#NLN4JRsJ5_XY{Ji0Z3L-q$6}W>qx~j)Q@UHREz_TdQ%YZCNTk!$OGke$BdQ- zIXUPt$6s2D0+)_@A9_K-=LAp;a*Sq?kl>tC&QCjW^rUR#G>RiAzzz*tj{f0PERDE& znwHYtqy|C>1A$8{IPFr}dNJFv3tp0WsLkJ}^QAGbaZNC=fMd)S2ZLZ}KW{7GEsTn}s28 zYQFR9Qrk}~GLsuH+@%|7##HqmLsw%W{hykI$rOzno&i?DBBnFUNit6>f-pgl(HbO( z8-hNSBAB9+b9~2~WMm(xu30;~o`p4Nq>7^M;wAFd<-|vVio^mBT-2bBn5LIN$82LK zw`!&KygPiS8RDr&n`1XM9nAx`&{X!!7DYJdb6E!OJ&jLobj>o2xChporp4`}(7OZ; zGp~f&f=ChQV*wQICshRzb5y)$Vd>kATTJ~CPgb+01S1zG& zzDQh?$0V^mewpW`R9uoaoM)9d#s}7~Mp3NTZtC3N4xoSa>GMggNNHa|p%a!0pl;;} z>*-2anYR3^$>i~mr8#2QF^sVt4oBuF5i>ZzB#ylG`qfyGLpmr3#WoR;yo~yaL@%|8 z1OPgQAJ6(!qB58c3UST{PtW}GO29WIa52vvNBI3Hu#DOCt7)9L1sfUNjpOzI02*5_ z4@q|`Mz_{h_#kF?WY6TJezl|JsLo3cGt;0wIi-jxlL}5)ebIt{FKVu4iE&1)zrxK` z_PgKPqANEMi0t+ zcCN2W_>Xdmqg2yGig@#FzE=8`Jbpw`d30k+J%_{aqe?;h+l#0A$_M^}qxsf^KMnK< zhs|u#f7eME=k=}qH^f>_tU}t}Ae8imVdcN%dj15~sZtIIAo?CEZu*TS(B`1sf~*y$&;$i4 zGZ@cGuthP2G$Jj7Yket}3Ov?QI5j>#R{B+IW43Wa1ZYj9+1eDM*(Te6%m3pCanY+TiErc$dh3%`gw* zrlOYHPPt-Zy12As+TuSkfA6H{{{S4XqIG`?cy{cWFRkp-Px-^)VI+Pc zuwEC9nOzwQoHDS%Q|>!c?Pa^Noh~gdWrAeoWQoc!KT%yoUlBE?jbYX!v(}+ejk=|( z%yI1eF#PJSD>ISCKmByzdwTcHc4JWR7On@{w4F;tn2+_EeBx-c>*dMF9>CO1;Oi^L z&WGZyCiwLbT^6&H`+WIR{{Vdo1ag_{&*7Rz#@?OlV^#2;y`qi!hLd*;IpT2_=+7f2GfB z5Rr^bkH8_Ye-X`cN~B{uN$zw;jO89^tqytAl1|h0sJ5SeYV4Np<%b46r0oG_Qh5IW zXF2?8xs8CqU({C0NS6k>5a5%~>sBp6^MD<3(xcjY06!YFVIFP?1bl-y^{Ja&thzCL z=vQ1+drexEdsH~~tfKTf=M{4`ZI`JvI>LaAC;*-b??!RXG3i3HZqKiL*HzCl$hTsn z&e%vBzZ$%3!v-ff^&P7&8GN?{b~&xTH6&c^`ElIyn&zyUcSmL(SBpgnY?VXpWAcoi zYc$#P`MYWIeoa2$mBhsW;n&nW2z+RXc=e=f?(1!zm*-9|`=x6&_3Y-1L#0UwAJ7F`Ot z&S8rapO-$JK9!+vtkH!i?UTD0&ov>?FW^!x1*{6A4fc!JBR^a-=k%+#zu5L_e%0bl z9wJV06tY|&{Dz00#Iv!iVwCuY!SFB7tY1!7 z_-*v!0Y1gte=34c9(Z0JVAD0Lo_b4h3?u&l9acZBQiO@-MT1}poVtKJZT0I-lEW>X z&wzRPaapo_O0-5M*TUAI_G_Ez<^2IsRHpcss=hppGefq=Lb`g$N8=%CuV%&0jh2Eh zx@AsJMqp3nRw2_aE>N^qcM`8cV}+aP$Q8)kd_(bF!ju}sup#~%XrqJ=`{6;XTdiMH zzh-OAM)KgX$unF#=kqlEpLQ;D(r454L2fK=86mM*OvABJjd@m_rg#JdYiC@rmYDMCXU#DZJSil70}dFfT=_@Ssa ze$S?9@p=~zDg8*UXC>U}ROEU)Fdayw5y>I_+;0B>?x8j16MRGQEP#mbqqiUQ?{6A@ zxHXq?;$3d^k#noxN~fr6h6C~k6uIJ3r6bzphDeum${}A_T~L0cR&4sVlW&nWv!f1$T zKaO-*TXZ(Ml*hJAafAN=p&Ij|$JUcIE~6<@iRf?fJ=9{`Mu|tP*DT-q2o*wqiF)ju zMIDR-{g^i&&55orC)SU~mpqD8;n?Ws@yCg7G8Veb8)C$%bw40 z$NJS8LP({|-%^rGWI2gEu*vm2(+hBD$>O7tu)NZ+Hf3rbJmc6_doMM9UBk4|3Xi|jKU{Es5NfpgUY&8vTP$pUia*_-{CAK1 zc9N*X=_ks~uOQtHwhcpFdGZDJ((+HKHzv5a#Q6>i>@!!?vo{;lwR zjX?HNqXt|t9S?Ct<4Uc$kU~{yUBukGwuTwn7A>aVUh_7(00n}Pp}7v7XQ$W|{wS>BEPoRnV8@~Al610=5B9jJhlj6!<)oI; zf4?ZKoatWoMyeH>vR{E}UmSR%Rb)Et`XhBnW-%}1m=%w4dv|zqb9FrD)>&Eq0GX{t z@bdov%S}cvq;a!16W$Gq0eDK$OAsZ6s3=IT_Wk4Vsy5V z{{Y{c`~^y|S;vnq#@Kt6E&R<-))z!t!ON>hE4Us9KT4K3CzJhL61UL_uB~M$_ZBqt z*%FWGO4H1G%`QDRk^He!^@q_GuxqP1#L_h>I$6j)+lrKESGoP;TgU$NqPAG>JdFK$ zqdR(W^x~n{nQIG8Z*wg)0#DC-82l!gPiImIIuI3$M0Lc=6|4^di;Yo%SC%W-#Q 
zC;c}cYeD}20{LYh_tB`|L(~`+v{>(x*2+;a{9|pB56P(IeFeJz5G;q) z#^d?$-}v=sOQKu1`k5q%o`WHm@@yZ}*0F3%U@An*)-FSjsxS}#0FKi>R$_qs!>aTn z>N<)qCAe~?b93nzQJid!T=rA7kNgF(pTjkoJ)CO$C5fVcM^dJ{QWz-;$8V=^%A<8w zO^$%4py~ere4MZ8RH_o`+~JU$m!KM3J{{VIW0Po?e^VzyCX18FU-p1wp z#}!xX*8c$Pes!XnGIXWUmA=l4)rqKPRaF~$=dElFIzHn+R;st~M-{o+qd6RkQK*}- zbE4xLm8>li@^X?)(WqSH4A!X8boBtV*HV2DFY~H8ILz|@0J&Q@$Q`RX(~^~wHN(?` zh1wxaH%&qR03MP*{dAlB1x~QqNB*wD)O(Ov{{S&SrN|v?2HV}8DluCmO1BQ2Mk0^> z$drFP(wnP>;Lk2S1Cji(QEp#RpK57!2fyW0%3a4d%yQp1`^P`dfJo_q^cbcYm)rBC zGV#FuDcOse5@fgDCqG(FY9k|O{{Yk}Rc3tQSo@6Au(^p2X*U7Yhv!W;^K~O1To30* zka`(c>AU;|8U^2BcMN}kr1MlC@gP!pF@wvK^j!Y{Gfq^AM+)ch0*wRS8|IAgGy2kb zjp>iW6(`6K`my+uX~D74RDBdtbQXQca_o4|;D5kTvS$7U{6YS;H<;(EvHWSgnpX=D z5W-ET(~*JgPd#akpE&(L!kol`$7-TcUT$M=aZj?~{{UKSyvE;3b)J7()9jc_<3^aL z3Sj9@6^XFy(cYWdmXL`Pu4vCVpd1f+Z37%lE@_$cqci~goY9((JkpXV0sK=*$9iF5 zP2HYpq$Vk9ibKUXkx{#4O;Qm}xhLyUFsXCaqke|11+*GCtC63ORWVQVSGFi> z;GMNJ;Eo1hSx900N^nQwFn^tG+QD$c{T}d1kKmnmf8)qEerfpEH3i&kzypT#HLqwQ+82(w0mYZp?&E$=+5`*G46=)~$i zo4Fr>6vGCUc^N-vy|P%q$%Or?1j)e4OUmT`0CmT;Qu_p_n=9`$WBpyi@5lTJ2I2Up z<5*g&>z1Z@BDa_9bHtHO(6E+2mA4GC=Wiq)aB7lw)R|dqbcaE?ek^Wnue}x-c1iyL zex<=b@E0|iaFW_huc_UEIfY6pErZOkizLF~_j| zGhGB4#q4(eN7@Xw=d2T`mNWkV0c?tw%3NHMxsKaa_Bbg20Nrp@{zjTXQ^6qqwXWaU zQz1z#WU|jwVI~vth4&}rih$_>{{Z90`r6=jjmfr?{`qaupTLTnb}N|9Z@svvC)c6; z$2A#9bIe@*SN{NssqUb-u{gVeOMgQ;7{U5BN+0{^RAX@&36Oj6f2}P?WMDf0a1S+| zss8}SzvWiP%;9$h<2fh%{#8d%b!eGYvw}IsD_U#bMs=FxV|0J@(*FRtTVk;EkM+|3 z0J&RvrC;L=H}J+wk;tS--GDp*eZZ$kbNW(_G4vIk^ft0=Lk@oQtN#Ff6H2he4=hRc zU-6*zCYF`iRzrfs2i;@!Z}1dl2S}qIP)#7DG=@LUa6VQG{_25EXr0uUY0^fCiniWC!Ny0mGBYG&Y0D_! zRASi4DRw6<*YT}XQhJ!mQfp1ik;olG4l~qxQ|Xa_fB-NllR)l3+4IlRozzo5<-1e+8nOtV{ZoA?w8lngfAwwjq_gTO*38CV5;Vc! zQud|;#T&X-Rx}@AB{xUkq(5PprJ*D%Uh{3YKvd` zDf~@D*YDeC{3^K^oRO!TQWgMptdn;ubY?&9ifI!+I~(yd`8$iBv8dBVI>@8(sEy)`bvv1;a{HfYK#qT|lGhe43tIRU|xB(0)(C ziG&1bV{wN-)jSYwBV@%-qjEoA@fBTKK)cEvyz^4UDxUuUoldBBi#8xPKmb%po|{f{ z_#7Y0by-RO0C-S_4!o7eBkDfz{6MM@O3BJGT=nO#;aW*{B#mR9G?f?bsZc=r1Hk_P zfZ+SlboUdBTQWs(z%sC5o(MUsD|s@=B;HwvM`CvmqYNH!tB=X7#3PI$we-_3)6 z^HkmLpi1OzjK%xHt@xndn^7dr&cSEky&hcI5fkemtT7~()@1@#L{0nta zw2tSMhVU^T!8~P${q&zO{8#a*R%B&%1Qk<;WjliXNc+EzDhUt<4yPliB9xozR8lP& zr-=*CYT`9K%Lx+3KBaSy#}xRkA)R+aBFGQ-PHC}S$`~vA29ssW&17D zZ=MA*f6q)|zwf2PetW4|F_!J`uB`zOPUxqq=PG`n@jj1k6vm1_*-`cYfr3^(w^exjT^eFZVkBk-pVPv9#jvAWz-){gWZ z^mD4AX%)7!?@<6(Bqc1UMYgnnA}Dd(s&QKfh{sV^OdA=BYa$rCBIr50H#@sGxr* z?+pDZ%&v3uqK|V;@(X{k*Pz<(-{ zZB%wa_-Faic%bE*AZxi>famZO)8{|%G502}tSSaFe-1yDJR*$RFf!czO&0N^N1dP?_8n?xZ~^*Nc$*Oq ztru+tqMn7Fi|I-f4#0jD51dKB#YwhS>AUdarq8Kd$Z_VHT}K3pXn_9!6A#4Zl)|Gw zIs6401>a$&R34_9Xa4|Zo?}3I6ZJHNC{u+|_*H06yhL2ObjP(;o4++-e(yZhY3k#- z6WCb#16Mf<_*FFMD&{^@^rbz*8w(4SjY#8@{OVEUxy*@!+^+_!!~Us#aZ(C~wJmI4 zg4K`wd(Myk!<~oo`=|7)B*|>+9n@_Dx7`yBo4$nnt@t%(M&~)s53Oq1>X$ai(IbNs zedt|7qkUDf59LDh2D=XoRra=3@%LC@pToUQlgO*IHdp(Hk@+4e@ivn(Y%xoxu{r+$ zMZ}Pk?*y42_sJDE_C4DvYat!{V0Ex!&OWe6Ly!0i5mn25iRQNA8<_4KHOxXsUXF5E zevOdh@L(!8vyMqXk!T&e6eqPT}~c-vzA=TF`!`jr`9_!6xm%#9e3FfpD3f(oDb5F~!JFrCz? 
z+(}=$Pb7U0QTS4qf_E}c5c5wSa^+_^=q=@s=^A#uBXa4}6^8Wze z!~QiF*xLoI=2sm`xY~d3q+EaBQID-fwtxBLKkxJh{{U)_>d;G zg_g96H!L@u<0lIL08c~hSXZSud(bMCSkzpy(VktZ+Ui!i%R#EeFuPc`?vUg%4um&C zy>NdDbk=tk(L*e-vPUpJTam+KgOAhmuWZ#UY;|}%i|7NOIRtR=vHm4JkMpl8@kfPq z9}h9Qjh^#OX69uar~AX()~XdfqStXX?A?xe&m%t6{^RH>ISz1pV05RB-%6s^uafu&l5x!|YE|rsO99n~;%Pk89RWW#Pu_8ioO^La{DRN92bD8@;z!ifQYl#imIsf*tdmuw{{UO~ ze}y!}(Hl&6$&_p_r%q}u%+|oC1RjKEIH{zCTph!=UX-yf?pFhWS|+Sy)SJ|XDIMnr zCu;H26{1c@CpA|eIQojPdQ?r^x@(kBa6M@14JX!&oK%)0deceyO(`_i#@c8^L+Jr;XKI~eE zGa&X^ml7!Y$AN=%I44?PV;C?u(R+slya(R~1$qV-=#eY9qneJ?FB=c`d5_`U&n9C--x3|7HH4UL->PEH!^4aY@hehs|}@jzcq#A7ew>rOa-*xpE0-PnuO`v zs5m-arIG&tPN4pw`J%yo=N%hk@u}mAL~%W&3!Nra zwY@vwCjgVj{{UG2m4v9NYZGNvuXb9atnn|y)woG7{64=h8>F%)k?x=2UU=y7v5rJ! zbm5hT2tI^Yw`qEu7m%#>vu?+s!Bg){*Zd==_^Kl=64z4$<()V+ThQ?{ z)Y6W%p{!_^`VI41Y8LGjfc&Q)E00iXKhRZfYjJu~1I+@A=K_EO(vF{6J5pyHiee6? zgU@~`4I$!y0os?0aCxMo1FmU+hnja`$LCEX=kTWvlz>r5+LyPbCUNzk2qPc7?R|MX zlI!&}5p`k}DF9TgMMi{_wcrY?;*71^)W3WM6HC)$pm>>9fRJRt`!N$MedPKhB>D zok+`XT`*Q_`jOJ$v*>!=tL6u(hN)Z0xexUPA*2;uM?}u4~#iECJ@*1yGck>pEVQjl4Z9-#M z_m(Cu-r*TT-RAb_Mli^#34=P2%`1A{(0f8M;?C39aPsG z`yJ7~+T==-W3%uAp9S;Iygv}>FRI$DSnA|`-0hlQ_WT6eixR7J@&Q9*Zpp=<S?DxE`8cKW_q8A2OH8C-sb94yaYerqZ$M%F-MpU6mATWo1;cFI1x zfH)5}Is=IiHm4qtv5$@OI7N)7fJuHW#-BANbY~H@3%2n)%H@Vo|Yq?5C-u z4sgNrkOO(2+(!KR!mLGdvYSfBiV}BNtDY*`xvMjybxDI9IPtK}tTnemy}Kpp$UK?! z$(xrC+`6NN)O%gfo)g#d!|nEh0zVhyCJE^sS4U6cWL*{{Bo&#tc;sd1bb^rf&Nb;; z*z0cX9jjjj7U}oKWsa6UymP+E-DPvlMBWC+_{H6uh|R$nsVB+3%_+_MO}<(5S%|(q zZr*>|eWG<;Qo_-v_SyK%2Z;w%O^Qw??&x#8jxKHjKmU##z24eAn71_%Y>1~cn5{WY zk#=-qNSdFOjBmAIa?#sJ|$!QF0ITp1&D8`QcXN3$7$%FOw)TB2>I{vhvhU#wz^nRM#&x z+b3&ZnScM%w*SGk?KeG_YCapYS|ogJd6HIUoOZ%?UWK+e`G7*C=ew(EOsV^JZZRrf zOW8K|&$2KvU1|8?lDotK{aVicCHEz~CQCaz&A&(iAy21(8%@#^J*p_5x)W@8FP2kZ zB!pzDjHL{fK4}H8dwlWityMea?Ma_#*Han^(r1?@IJcTiaJ)JR+adyHPEcH5qaJl@%=6RQ{OA1dIQW>z?M zzoUF#Ryk@ov29~O^@ZAisW(MVS00^ge?H;r+^QX=sb6_H$3brL_OQyuviO>5M2m?h z_#4Xb668?8iILIBNvo0bt6Ow7+r1SzS8f<`?Fn@}v8m83(CDS|M4VE8O}u$3_`8-TYgfZ0H*9PAiB{7@o z=Cjj%J!i_$n&xe)R!aw?dnYyQew=JkHc=`qHt#qczwe5|)poPw!v#kYFTXceYtWw> znaH!jt#8p^9QxTl$P$SYo$$EVb1ZPAU{|TLc&J~%;5yYK_T(k&*Sso~KR#AcFwkOs@#K$ul{Lbx!zR!_PFzHr#RKiJBtPac+1zz8;wg;I9e(qS5gz~^vMTw z6_RZnPHmC%KVB?Uce}UWS$U1gp5muTclMqssk?i={9%$MN~w(bS`e(f3c+TQ-} zDZ~Zg2k$>}DLxjUsdQ|$@bo-~(W0PHq4tg|&3B`=MoS(T#L_}2J#i`|o2fbj&zgr`le0~Ie`5F?d*%PkgM8(?hjG2o?<;L*9qD_uYC_%fD8 zplT0Z`@PAwgOLRh@oD?xz$uKxD2

=Z-S;zl}zIHz^$M zNIJ4i!aePajkGuIeFo7^`JAobJ*w+##Amav3C|$g&4O|U_^4!D8oL|ZMl^0(Kj(>1 zmn1sQk1oBqyjA?Q^lueG-a>|p&R!K>*)7$S>G#&4K;UEIvMS?S z+pIGYWNo@@5O4>0x@w6lF{%(!Fi6;4q43$c= zjI~Omw@lhXQf{u=K&vPw46f7>yk|%9ew!I(T04&75ua!8er@>O3DtYJr1~z$nD889Vg64VU6C?_Sn*GBdaL($y>0v7MnQQ%&6=)8mbHor2cJ zUT@Mi8=U!W+54$I-KS!2r+TN-wZqGITpTrN6T@=FHT*tu?}S-Bb`A4m(g;0S$0k9b z#gEJPIkhgMOx>Sw+nQC=H05j$p6-KEuzNr0Y-~03)U?iDLg7Rj38a&^++6~%soxrU zl6ZOV>w9;SBOQ*aFI3)M^eN8Wq3E+kidnfR*oY84l7`yymY}BJO%PM6Pt}jf_c@w< zC~i6m{?vKUK)1wWvUjj_p(^otz@W`4@!N4-gGHO@3+-0FQcjI*#L*V@2!kgFo!mud zcpjRHIAeR-VY(u(SvAUVHQ}4K3y&F~TP<^NU1OUEw3wFFL(mi@#uVRQIghmVbaM=rGc5tM}#hSYdEp z>`SB3rArXzgI$v!o}>-g$-UqDv^4==^eFtr;r^P{`?`m8M+}r-f`9^Z7@XkJ*KjHG!yR^?t$yza1WM4zf-tWg*#rIX3hhv_2Z>M!* zlGb_I77m)gsa`{raqsQa?6(7myB8Kb;YNA15qH) zjFSEQOSbuO#Nelx^UJHAe)-~4{gsI^FDI_v?;$hlQ1AAzLea)# zbAA0}l}F{xBF>o~r%sMxAM7k;iLZ?nGQhLOa(Aho5Wdy#vCIqqSRf*-=2Et$L|Ehc z=APExbGv@H2TZ2Z9IEx7nXf3ZJ(s?7GN;%+M!N214=u0NQ*mbDz zN9)*QaiNGf58Vf6af`w%#+Ce1k2JVDFY*?Muvu#$=N9`Rfwl%$aJXko`mE;*qp*Oq zT|U@4d~t|@p*l8iy?Fe^qY;8usE0Rw3&(}MHnxs?H>4jo>&&HTW$#<#Vv96A-5hW; zn0?^tA#(emt3c=J+iJYMg6;DcA5(Z6=b6=Dc@$I0oSpB z%@4mT#CzPi)7Ib9GB60vNF55lBGNnc!mKN)I=LY3k@UM7(LE?t$8*zzKZUv8? z1ZiG6yYt;g!<d2DkK##-%iB=1q;uWmceY4UA=qS4tXQ z94_efSQh#WA7Q?1&v{94HPzjQOYK|2$6duNGYK`;&lY2EgS(DDTEk5IxOJk%q`uoc zW?I3$d#SO0ZgstuaNF_lk^@tdQyQn+>n8Umo+^9U=cH*XUf64dUUWo#=s4#$p_`ww z_K%d*G~8?W&CF-mL(X8}n#A2(VlPb#J81neKAQNeS^10Y#@I`T%dUN~Z5hr*T)LRt zj$5Oax+*;}xj0vQZ~WuZZ`+Hvo7k03HuW8%Cf(2&4+l@cBwurK-4@rfExxMC2$dR< zPE{GIo(g}~&A508nLoJxY;XO5>xt%G^80~ELnRUJHVc+cS#Oe!rtB0h$?ucNny+1V zWxVZcol?C5$7zwdE=g|#5)U4yJ8ci_ZM${@(`uV)i>P3QWxrJ1hd(c~S?|f_^>mSn zjK+yoFHw)gmZG^{^Gl}e`T|Y*rxjYSYo9haS~y=_`2FV1WjBvKdse=`Q(|G?QPq~b zhwc@NSFa6-7~KELHLU3QflmXQGqupyE_{eFFh<){Pllf!EZ7nyArgSaUv0_M4Ji3h zXd*K6*k{p9A!1H&x(ywy+`}>LR^EEPZ^^a02Y^q1}--3(GD~rX&Fgt>=no{+X zrItvS$K(3VZ=+9q(zm)G?4lmmYP~SiA~eq}^9?+U>uOi)x2P;> zIp?=^f64x)D^%H$lJ+je>eKfwBC^dLbssy1i5yzLPd@!CF3j-Yk4;1cF&BTMdIJY# zg~L+2%wzVi)1OL@r+lP|3I4cWDp|Mdl*hJ`59(Dne$&_~*j3P0?2tCn+u=d5N_ef_ zk&>)#>bN_c*y$*L>pd}~Ance?cSM*+PKNyO*$(F}_p3oAt5s4y+x7%r$C=t-9coR| z;`UZ^+%~i<7a{E>NN?w~AHJ~Uppn_-tq;C-Wv;i`k=R|ol{jyqwTMZE4d!F^?sWww zE!(Z-E8e>b8yG6q3+k9%?ih3`_?9NjDG1+n`1t$}>O0~SdRF(l%s;CBqK3De`m!V4 zjwn*y@3=@P^tEXy^0qYZzJ`rTNlA3@QDk2p;e?$5 zCUtBNhXp0}lvCw9k#(->Q_y`>S`ayq?RM(%O(c%N0;GT!rV_qU}6?T`Bn zx{O@7DUE&lJmK+;=;&))zmEmw5~_Vh{s-mAcCwL^784`WQ8gd4y_cEZP8~Jf@jZZD zW=fboxPSNazU}rm>K0n*{>Yggu^fAnvE|}~JF%;AeCPePkyFRoj%#GaDUZe!MksbusEp(f2^VwsVOs z#Xh<3a5;p<_p9;|ye*GJjZ})y1=(*dJt~KESf6%njrhqZ0bY%_`l9HM0vA7W99qit zu_`<_3F4i69%6yxvE}(&^MX32V*a+CcYv)~m3auPz+-0}Qq z5t{xI+;PlcL0R|KfYEse?i+*4*|rw1+>S%I67tGpPi^#h=@;C5LIG%YWlZibFq+|GmmqQ#SNes=T2=f)EuYGYxd0(IB?;vaS7SDw@{)n zC)5IG)>mU1bh|%geZJaCBXpG9nP+h}BL^>abXYDH$oIdTY#p^&`rEwjC6lYor{cUS zzmRr=?Kb#n#74WQ;aqu9Yu4xF!DaUH=v^qEQWfDJ)fr*>SFN&obQIZkD2!@W0XrcD z@1FA6ab%eGbXWI5eBEg_9=)kzY|3#k)=($iKvC^{5IsbHw?6#j1PhLoP#j{HmON zVY@$W%aAUhx8YU}+}mE`oA$sG5nSB}{^nh~w0?{0!v<64@&L*1^JEuwl?#Ni3)R4HT%0Vpbb(YtvY{D;!ov#t z?Y%9%>C@_Bgt&Xq;jq2n+0OA|I=J)%j>Xa^UDg6hhg;p5#P*41C#TBhQ2Ev2gDD>+ zD89lT8M=GpqF1Acy}s?gBQ%d%e7I_UM5Z#Jr~J#pheb|37CqsHM!q7q-9NnE@#Y5g z-eDn$jpZ-uOD^4iSbkD_=(npKm8KV8U(lgben!SQ#*wfM|u{hlS70gWHj z7G@?DVN^C2s~&pf`Qr@HCx?e$Fy3PTiH;YnrU)*~-?uQfWLrql+J}lF zh3&!@=>z(Ka{fLWV|XIxZ`Kso6gQyW+KWmxzK*bbqaL#0npy62l+Zb2tGDw0XL5Ev zEth@ff|I=x+tze7$&{_PW8vZur90v2x~SfARnrOa^^MCP)P}!nh_<`fboSgp8BvY2 zG_c(`_7bK&`>S=#W=Sm8jBNP1u7Mhpa*r6eA#{yf!0^|3xkW9*KQ=k3&g)o>F&0lh z{lnW#P#eQD5WM=qxH!#Z?efE$^DG%vTUSmua#)vBuQ6J>yQY1I1HkPAKQ3SWwe`vx 
zq7S(9ax8J9e`iPf#sJNeXY#q}*iCUkksG7ZiODt!pHTX7#xWIl>qCQwJYJ6nwp++2 z=55J*Ak{1T;&+6$soodgMDf<)0*Oa00b$*j5)QtyxxUqxplZExMKINDjmUe`@PjNFfPYkkcRVk_okP_2-R5bQ{!abi(eJfK*Xpm;UXu zU3N2&gFE~H(N0cZ?}JW|2Ff$aD$%H)#kEtpU;=?=Y^+mO6vt4hc5HE zUf1#<=a`$k0KUCY1^+VdZRsLWh3LA%cI>FG>HK?sB~KV3+cC-gIwxr#_Z_y#2wGy_ ztV_J5g4&cw*A;%2w$EJp1NNidjhm;kub98Q^u~cTDjXRVRxdkL{cQeWSM`IPP0F`)k>m~kX$U30eAn+m4kUL?3P*l({f zU!RTM%DJSIHFW{wF0<`y_^6UP^|^&Gy>e|_eu&WZ7w?3NQS(>0*PW9dAitwr7{nZ> zoNlKrF#N5erEip|HMD@XOO4?e&(!NCNcWYuD;t74>8EmoY>TJ-=`0JJ#Jg#w zgP0e$szk@WJ(~<$g-&U3=rJDy*OspT{RIkv(n4uzp`=lgC?S-Pf*}IDWTfYTPicgp zkeYyipyUcG>JCsw383zap<3syLoG%j79$Xag3s+hwN5Zco5Es5L~&_!8aGylK_Zhl zG>Q%(CuBpnKi!op#7*XVHBBpMTk6hmMIkt_AqU~#%wOjtz7fD=N{N!mac zZ=`254Ig81Si?2C;8@g29FN1}r?U|Yv!(HRcmq6US_`>COHxly7iS2nMYE;#aJmLK z95@!tmc;4f40WfYk=oH{L%g0oO#atNs@eS;VnY+a!Y{t0kmR!e86o~OAwvI5?XoC^ zb*Oo;(#5qzKu;b$kOI`p1ruA0l9_8gHIdmi**}iJ08un-Q6y3tGNT3xGfNNUCI!)> zaiU1{KrV|#C$nM#dAPqJEcHt#*h-6&T?GdIzyWkT?M@OHfe|gJ6lO zE09@SE}e?ODh7%!L1K|)M}5S?4G4WjeMt(PMG2SE+jtNm>Z^d!1q6Jl9MMPOEFP7K zG+L-H&ZdQg8a_bCve+bM2o-}vAjLyTG&a%zd2AtGfKD}@hZN_~nBjtjsBq+Bq~SuG zZ8$=WL8Gu)(Ig%fvr^kggv*W+eFo&QG)78Nq9QO_n1={yE{jH|a*^bP7*Vn|4Vfr_ z(O&4IE{4=xxX1DqLY7Qof`1sGT{LRtPiE5s67sdeCjlvj%YxMe2@5n73>P4&FtXTL z74mg3tHqfCupVdn&nOY7K+*(;z&zm4I1gS9r~ya-=$S==C|VHIF<3!V`rlYz5N?p= ze}R58fSSh#)c?2g|I=0uLIupaAWjkqtcOJkB6PtU2i|(%tq2IywL#&p12q@&$kKas@s>%`tKqY2e`i^N@cl zu-IrA_ENYRH#b+4!-+07G(g}sA`n_~TGFtOqc>(CL>)0$1nk;UI-)x;H)p-t9PSp(|yL3Sxi`6 zKlh&rif{`*hM4~&6a3=Q@RIv?9t}6h3c_E2+D{%WHz{L)kQL%kLa7WAY7qh@zG|NA z0%$D{>wUrE>Vh1tLh?(7BX_Nn|)FnOi49W%2@< zB*q+$;%sU(i5(Qk4}LR}iifaSQOuw~V8)+`3UauybLADI#sD1#(wM<4ln7rCArg}l zO=ZtzD@bDoQDf%Th(tzFIb0fxIk!@f69KZQnHm{sIR&{6RfGacT+RR?Bq{fk2~hK7 zq-9c(G8sr!V^tB5xv2^AI9w82mj8v2(*gk=0n7kCB*7N~F^7`dN3=P$r(yG}=h<1jkFDnWSQaUsV9ZID~1V_=)Bn}4{n#AN{#Kct49w2+*h(Wy= zq*$=H)9GMY2WvSS&0v9H!BI>KG+TIG0_80nGZP*DH`>Q}9hyrD;TVg}w1~wkb7Lc@%ErpG zRFt*;JT#71hTs5NVc-SEu~-aXO*k6@{eq|*3Y!)Ii4MB>fkrgo3EY%~=0s2_v|t(q z&8CKcX2`8cOf*adO$I7pvmzpB%n&qwpc->44?q?kVoKTjV@#D0>0Ce4^p07yU}?EwX80dS!G zV|2C9vEUtp1K+p}TIlFdDzK~?zg`_qasWjxD?%F%coY_!34j1;LybS?k3LX%WHGe) z$q>IE29g|B(VV$*xNM-?U+e}64p|w3BRpg_2G3|3m=s#2W-JO!_0ufXW(}hbcmk$u zYGf1*>~55e{gokgD{n|hND!L^Amj}8tY|3o!(KoFM=ZJ+kqS@?!6po(wX+t6Jzy4< zdzqtSeSn&FK8lgr;a&d;pkVLidHMEmPvwa_^5*2TgH zSOAP!JED~#Gms(xOUPTG0zhCm&^p8(_HaIb*-uCl*wf6!{=EfM%p(+J1?XTJo5R)O z8%jgt*9f4Yfu;qcnN%u>L~w!*EC>g84BD2(W{|*+1t5;XrN;uq109)gA+lx|3T^N0^VBC*@ORL)%hE8@Tv_{d%I9( zKv4C>HLPk`1s0O@KigdYPwdkF>GWi0^9^`5IY^mfNY&OGSk?R+@NE9rkk8qg|8shO z3H{|`)kj+Fdd5*D%z zDbtKpJ@6QStPvKHzd!qzC;Uee3R9aFl}Qy+ecpmq9e4~=`(N72{LA9<)8jcNKh=g+ z9e&C;`A-{~UrEaB%+!^URuz$%Xv0cFfeLJ0z*lB>D3aPg0YR(a0eNEwU_rcvoZ@HPXC`fDSfZ6H65Ln}uyX%ukU1sVo2 zvWVC}8fFtbv@*#3NDv1=j2T6zCupJnu&h8a|WjVLyznACS&FHUu|JwQ&-hV>o-+|JfJK0|u|J?Q; zvxVJ@O(k(zYyj$bWjc+_Cb457Z;6Nn8x~eHBmxiYc;Jl}T3Oc!ql>|Ulpl0}-8OJe zBMyVZ=qZE8V2TrnL|1QP{)r3OYH1zlWf)>lAQ0ArH}rBM5cCO_Ap~0jA%tKR0@qnu z{?r}~RoeW~?iOv$FN05*3f%5MAVd?uIPeV-vGxQ-s7xR@Q$T;qH4Y4Vl%JiKz7NiY zN4EFkl9|plSDGc+$2~g4fiY7@$48JEL^{>ZE6yQ|72>Un8n7&xIG{;}{0&p#BV!w-?9H79Zv7txJdKI53EzL3Xym6dYcc z!f>HG&;U=SWhmekO3}q*sXmtceoU_@lD7-b&(4#!&Wb=IGrhPJdwOt?Jw1AcAI}$T z0DSe<^V_|>u)f|9Z5wDVQ9<4@oM12?YOLd*yqT7eX=p=Twm`XIwkfW$o6yl|FUehG?I5rM38+rJYPQsoldqR z2G6m5s4s&KbVD@wU$rO0D=LUSgDnV048I6J@0f5OoIxnr+Y{h`K=bo9z|EE+98v6S z0rqX)29vxE803G5 zJ1_kpd&f{89EceVBiR0br$aK%G1AZ516%^}Urb`4Frw`lF*&`&% z*F7Y=c0CrZTnAP1gD~9Af{OM~P_~52c2JoB)mqZ2 z_QY6UFDSnH@j!eI*KyO)ad6X{q2~!la(;jK5FTj>9ooA@1ASnj3Tvo8HVm($V+2Yz zwlEV*IR3+jP#=E$w~Pt$_KFL#1M`O^CFtt}>*xgiEBx&E>jXr}7wT^i_ZNOJ!8!b3 
zeTI^mE_!}GKz|O9d^$D|KS09Lnh5uYRRga_kd7gJgBP?ALhBYN$NKpMg;E$GzIX$l z^oHj|h=%y<{8#)vq49BGjo|n}`f`Q!6RyMK^S2~|I=)i)>mghuK!zvy={fR190&)P zjScc4!ZH#p!!&ht@ScAe4}oOvcv${9aRzE85Z%`3=;&L6@pJ9-SAD^F`y+M|?BYOw zP1Y&g2(i%kXb+&{eKr^l?4TqHykL1Fg6zDw zaM2zfA3ZyM1`mIg7uFKs$;S}EiURe={UPr^@J|S)b|80R66s{-dO{4;KL(Zq;KLVa zjgK=x9G)cTZTS!I2!|4QeeuG_k@dV{eQ}CxaPA%aGQriRL_}!Ai79#} a@kH}eQn?QBOW42iAK@d9K)4Zzy8j0)mFal^ literal 0 HcmV?d00001 diff --git a/docs/qa.md b/docs/qa.md deleted file mode 100644 index a68177d5d..000000000 --- a/docs/qa.md +++ /dev/null @@ -1,93 +0,0 @@ -# 常见 Q&A - -0. **Megatron 模型如何转成 HF** - -使用如下命令进行格式转换 - -```bash -python mcore_adapter/tools/convert.py --checkpoint_path path_to_megatron_model --output_path path_to_output_hf_model -``` - -0. **什么是colocate模式** - -actor_train、actor_infer、reference多个角色之间的device_mapping可以复用,比如actor_train配置device_mapping: list(range(0,8)), actor_infer配置device_mapping: list(range(0,8)), reference配置device_mapping: list(range(0,8)) , 框架底层通过对保证了多个角色间GPU的复用 - - -0. **什么是分离模式** - -actor_train、actor_infer、reference多个角色之间的device_mapping 之间没有交集,每个角色持有一组独立的GPU device资源,比如actor_train配置device_mapping: list(range(0,8)), actor_infer配置device_mapping: list(range(8,16)), reference配置device_mapping: list(range(16,24)) - - -0. **rollout_batch_size/num_return_sequences_in_group是什么意思** - -rollout_batch_size: 一个batch中的prompt数量 - -num_return_sequences_in_group: 针对每条prompt采样数,也就是vllm/sglang推理中通常意义上的n参数 - -也就是实际一个batch内样本数 = rollout_batch_size * num_return_sequences_in_group - -对于Megatron Backend, 需要注意: - -rollout_batch_size * num_return_sequences_in_group 整数倍于 -gradient_accumulation_steps * per_device_train_batch_size * (world_size/tensor_model_parallel_size/pipeline_model_parallel_size/context_parallel_size) - - -0. **如何设置gradient_accumulation_steps/per_device_train_batch_size** - -***对于DeepSpeed Backend*** - -global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * world_size - -world_size 即actor_train/critic的device_mapping长度 - -***对于Megatron Backend*** - -global_batch_size = per_device_train_batch_size * gradient_accumulation_steps * world_size / tensor_model_parallel_size / pipeline_model_parallel_size / context_parallel_size - -world_size 即actor_train/critic的device_mapping长度 - -注意: 不需要除以expert_model_parallel_size - - -0. **如何获取训练的timeline** - -可以尝试在yaml中开启profile - -```yaml -system_envs: - RAY_PROFILING: "1" -profiler_output_dir: /data/oss_bucket_0/yali/llm/profile/${exp_name} -``` - -然后利用https://ui.perfetto.dev/ 工具进行分析 - -0. **如何debug代码** - -在对应后端的Platform的env中设置 "RAY_DEBUG": "legacy" , 就可以采用pdb进行单步调试 - - -0. **如果出现这种错误: self.node2pg[node_rank] KeyError: 1** - -检查申请的GPU总数和device_mapping的配置,出现该错误一般是max(device_mapping) < 或者 > total_gpu_nums - -0. **如果出现这种错误:assert self.lr_decay_steps > 0** - -roll数据分配的时候,会将rollout_batch_size的样本,按dp size 分发到每个actor_train worker上,然后再按gradient_accumulation_steps计算每次梯度更新的样本。配置一除就是0; - -详细配置逻辑可以参考手册:https://alibaba.github.io/ROLL/docs/User%20Guides/Configuration/config_guide#training-arguments-training_args - - -1. **如果出现这种错误:AssertionError: batch_size 32 < chunks 64** - -batch_size 小于reference/actor_train 的DP size,导致dispatch时数据不够切分,可以调整rollout_batch_size解决 - - -0. 
**如果出现这种错误:TypeError: BackendCompilerFailed.__init__() missing 1 required positional argument** - -可以尝试在yaml增加配置项解决: - -```yaml -system_envs: - NVTE_TORCH_COMPILE: '0' -``` - diff --git a/docs/reward_worker_examples/README_code_sandbox_reward_worker.md b/docs/reward_worker_examples/README_code_sandbox_reward_worker.md deleted file mode 100644 index 0b12d3a72..000000000 --- a/docs/reward_worker_examples/README_code_sandbox_reward_worker.md +++ /dev/null @@ -1,216 +0,0 @@ -# Code Sandbox Reward Worker - -The `code_sandbox_reward_worker.py` module provides functionality to evaluate code solutions in a sandbox environment and compute rewards based on test case results. It supports multiple testing modes to accommodate different types of code evaluation scenarios. - -## Overview - -The Code Sandbox Reward Worker evaluates code solutions by: -1. Extracting code from model responses -2. Running the code against test cases -3. Computing rewards based on test results -4. Providing detailed feedback on test failures - -The worker supports both HTTP sandbox testing (remote execution) and local testing, making it flexible for different deployment scenarios. - -## Test Case Types - -The worker supports five different test case types, each with its own format and requirements: - -### 1. Assert Testing - -Used for simple assertion-based testing of code. - -**Format:** -```json -[ - { - "assert_code": "assert find_binary_numbers(2) == ['00', '01', '10']" - }, - { - "assert_code": "assert find_binary_numbers(3) == ['000', '001', '010', '100', '101']" - }, - { - "assert_code": "assert find_binary_numbers(1) == ['0', '1']" - } -] -``` - -**Key Components:** -- `assert_code`: Simple assert statements that test the function - -**Example Use Case:** -Testing functions with straightforward inputs and expected outputs. - -### 2. Pytest Testing - -Used for more complex test cases using pytest-style test functions. - -**Format:** -```json -{ - "assert_code": "\n\n\ndef test_even_length_string_with_reverse_parts():\n assert can_split_into_reverse_parts(\"abccba\") == True\n\ndef test_even_length_string_without_reverse_parts():\n assert can_split_into_reverse_parts(\"abcdef\") == False\n\ndef test_odd_length_string():\n assert can_split_into_reverse_parts(\"abcba\") == False\n\ndef test_empty_string():\n assert can_split_into_reverse_parts(\"\") == True\n\ndef test_single_character_string():\n assert can_split_into_reverse_parts(\"a\") == False\n\ndef test_string_with_mixed_cases():\n assert can_split_into_reverse_parts(\"AbCCba\") == False\n\ndef test_palindrome_string():\n assert can_split_into_reverse_parts(\"abccba\") == True\n assert can_split_into_reverse_parts(\"abcdedcba\") == False" -} -``` - -**Key Components:** -- `assert_code`: Contains multiple test functions with assertions - -**Example Use Case:** -Testing functions that require multiple test cases with different scenarios. - -### 3. Input/Output Testing - -Used for testing code with standard input and expected output. - -**Format:** -```json -[ - { - "stdin": "[1, 2, 3]", - "expected_stdout": "9" - }, - { - "stdin": "[1, 2, 3, 4]", - "expected_stdout": "19" - }, - { - "stdin": "[1, 2, 3, 4, 5]", - "expected_stdout": "33" - } -] -``` - -**Key Components:** -- `stdin`: Input to provide to the program -- `expected_stdout`: Expected output from the program - -**Example Use Case:** -Testing functions that read from standard input and write to standard output. - -### 4. 
Check-Based Testing - -Used for testing code with a specific entry point and custom imports. - -**Format:** - -```json -{ - "assert_code": "def check(candidate):\n assert candidate(nums = [1,3,5,6], target = 5) == 2\n assert candidate(nums = [1,3,5,6], target = 2) == 1\n assert candidate(nums = [1,3,5,6], target = 7) == 4\n", - "import_prefix": "import collections\nimport string\nimport math\nimport datetime\n\nfrom typing import *\nfrom functools import *\nfrom collections import *\nfrom itertools import *\nfrom heapq import *\nfrom bisect import *\nfrom string import *\nfrom operator import *\nfrom math import *\n\ninf = float('inf')\n\n", - "entry_point": "Solution().searchInsert" -} -``` - -**Key Components:** - -- `assert_code`: Contains the test assertions -- `import_prefix`: Imports to include before the solution code -- `entry_point`: Function or method to call for testing - -**Example Use Case:** -Testing LeetCode-style problems where a specific method of a class needs to be evaluated. - -### 5. Text Testing - -Used for validating text responses rather than code execution. - -**Format:** - -```json -[ - { - "assert_code": "import re\ndef check_keyword_highlight(input_str):\n highlights = re.findall(r'\\\\*[^\\\\n\\\\*]+\\\\*', input_str)\n return len(highlights) >= 1\ninput_str = {response}\nres = check_keyword_highlight(input_str)\nassert res == True" - }, - { - "assert_code": "import re\ndef check_title(input_str):\n pattern = r'<<[^\\\\n]+>>'\n re_pattern = re.compile(pattern)\n titles = re.findall(re_pattern, input_str)\n\n for title in titles:\n if title.lstrip('<').rstrip('>').strip():\n return True\n return False\ninput_str = {response}\nres = check_title(input_str)\nassert res == True" - } -] -``` - -**Key Components:** - -- `assert_code`: Python code that validates the text response -- `{response}`: Placeholder that gets replaced with the model's response - -**Example Use Case:** -Validating formatting, structure, or content of text responses like ensuring a response has a title, highlights, or specific number of sentences. - -## Data Format - -When using the Code Sandbox Reward Worker, each test case should include: - -1. `id`: A unique identifier for the test case -2. `prompt`: The problem statement or question -3. `case_type`: The type of test case (one of: "check_based", "text", "assert", "pytest", "input") -4. `test_case_function`: The function name to test (if applicable) -5. `test_cases`: The test cases in the appropriate format for the case type -6. 
`tag`: Optional tag for categorizing test cases - -Example: - -```json -{ - "id": "3c45c692be4866bcf8922c7825ffe0bd00e5539034725594a2e24512f44834b5", - "domain": "code_sandbox", - "source": "leetcode", - "difficulty": "0", - "prompt": "You are an expert Python programmer...", - "case_type": "check_based", - "test_case_function": "Solution().searchInsert", - "test_cases": "[{\"assert_code\": \"def check(candidate):\\n assert candidate(nums = [1,3,5,6], target = 5) == 2\\n assert candidate(nums = [1,3,5,6], target = 2) == 1\\n assert candidate(nums = [1,3,5,6], target = 7) == 4\\n\", \"import_prefix\": \"import collections\\nimport string\\nimport math\\nimport datetime\\n\\nfrom typing import *\\nfrom functools import *\\nfrom collections import *\\nfrom itertools import *\\nfrom heapq import *\\nfrom bisect import *\\nfrom string import *\\nfrom operator import *\\nfrom math import *\\n\\ninf = float('inf')\\n\\n\", \"entry_point\": \"Solution().searchInsert\"}]", - "tag": "leetcode-Easy" -} -``` - -## Important Considerations - -### Local vs. HTTP Sandbox Testing - -The worker supports two testing modes: - -1. **Local Testing**: Executes code locally using Python's exec/eval - - Faster but less secure - - Good for development and testing - - Set `use_local=True` in the worker config - -2. **HTTP Sandbox Testing**: Executes code in a remote sandbox - - More secure but requires a sandbox service - - Good for production use - - Provide `code_url` in the worker config - -## Usage - -To use the Code Sandbox Reward Worker: - -1. Create a worker configuration with the appropriate settings: - -```python -from roll.pipeline.rlvr.rlvr_config import RewardConfig - -config = RewardConfig( - use_local=True, # Set to False for HTTP sandbox - code_url="http://your-sandbox-url.com/execute", # Only needed for HTTP sandbox - model_args={...} # Model configuration -) -``` - -2. Initialize the worker: - -```python -from roll.pipeline.rlvr.rewards.code_sandbox_reward_worker import CodeSandboxRewardWorker - -worker = CodeSandboxRewardWorker(config) -``` - -3. Compute rewards: - -```python -from roll.distributed.scheduler.protocol import DataProto - -# Prepare data with prompts, responses, and test cases -data = DataProto.from_dict(...) - -# Compute rewards -results = worker.compute_rewards(data) -``` - diff --git a/docs_roll/docs/Development/Developer Guide/custom_loss_func.md b/docs_roll/docs/Development/Developer Guide/custom_loss_func.md new file mode 100644 index 000000000..4c1b7dfa2 --- /dev/null +++ b/docs_roll/docs/Development/Developer Guide/custom_loss_func.md @@ -0,0 +1,360 @@ +# Guide to Implementing Custom `loss_func` + +When implementing a custom `loss_func` in ROLL, the most critical aspects are **how the loss is aggregated** and **how `loss_scale` is handled**. Mishandling these two points can cause the final computed loss or gradients to **deviate from the result that would be obtained by performing a single forward pass over the entire global batch**, thereby introducing training bias—especially severe in complex training scenarios involving **data parallelism (DP) + gradient accumulation (GA) + sequence packing**. + +--- + +## 1. Common Loss Aggregation Strategies + +Consider a **global batch** containing $B$ sequences. Let the length of the $i$-th sequence be $T_i$, with a per-token mask $m_{i,t} \in \{0,1\}$ indicating whether position $t$ participates in loss computation. 
The number of valid tokens is: + +$$ +N_i = \sum_{t=1}^{T_i} m_{i,t}, \quad N_{\text{all}} = \sum_{i=1}^{B} N_i +$$ + +Let $\mathcal{L}_{i,t}$ denote the per-token loss at position $t$ of sequence $i$ (e.g., NLL, CE, KL divergence, policy loss, etc.). + +### 1.1 Token-level Loss (`token-mean`) + +Compute the average loss over **all valid tokens in the global batch**: + +$$ +\mathcal{L}_{\text{token}} = \frac{1}{N_{\text{all}}} \sum_{i=1}^{B} \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} +$$ + +**Property**: Each token has equal weight; longer sequences contribute more due to having more valid tokens. + +### 1.2 Sequence-level Loss (`seq-mean`) + +First aggregate within each sequence, then average across sequences. ROLL commonly uses two variants: + +**(a) `seq-mean-token-sum`** +Sum losses over tokens within each sequence, then average across sequences: +$$ +\mathcal{L}_{\text{seq-sum}} = \frac{1}{B} \sum_{i=1}^{B} \left( \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +**(b) `seq-mean-token-mean`** +Average losses over tokens within each sequence, then average across sequences: +$$ +\mathcal{L}_{\text{seq-mean}} = \frac{1}{B} \sum_{i=1}^{B} \left( \frac{1}{N_i} \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +**Property**: Each sequence has equal weight, avoiding bias due to sequence length differences. + +--- + +## 2. Micro-batch Partitioning in Distributed Training + +In practice, a single global training step typically involves: + +- **Data Parallelism (DP)**: The global batch is split across multiple DP ranks; +- **Gradient Accumulation (GA)**: Each rank further splits its data into multiple micro-batches, processed sequentially; +- **Sequence Packing**: To reduce padding and improve GPU utilization, multiple samples are concatenated into fixed-length packed sequences. + +Let: +- DP world size be $D$, +- Gradient accumulation steps be $A$, +- Then the total number of micro-batches per global step is $M = D \times A$. + +Denote the set of samples in the $k$-th micro-batch as $\mathcal{S}_k$. Its number of valid tokens is: +$$ +N_k = \sum_{(i,t) \in \mathcal{S}_k} m_{i,t}, \quad N_{\text{all}} = \sum_{k=1}^{M} N_k +$$ +The number of sequences (samples) in this micro-batch is $B_k$, satisfying: +$$ +B = \sum_{k=1}^{M} B_k +$$ + +### 2.1 Why Does Sequence Packing Cause $B_k$ to Vary? + +With sequence packing enabled, frameworks typically construct micro-batches based on a **token budget** rather than a fixed number of samples: + +- Short sequences can be densely packed → some micro-batches contain many samples ($B_k$ large); +- Long sequences consume more space → some micro-batches contain few samples ($B_k$ small). + +Thus, under packing, the number of samples per micro-batch $B_k$ is typically **uneven and unpredictable**, posing challenges for correct sequence-level loss aggregation. + +--- + +## 3. Core Issue: Why You Should Not Normalize Using Local Statistics Within Micro-batches + +ROLL’s goal is: **regardless of training configuration (DP/GA/Packing), the final loss used for backpropagation must be mathematically equivalent to computing the loss over the entire global batch in one go** (as defined in Section 1). + +If each micro-batch uses its own local statistics (e.g., $N_k$ or $B_k$) for normalization, and gradients are accumulated via the backend, the result is generally **not equivalent**. 
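+
+A tiny numeric sketch (hypothetical per-token losses across two unequal micro-batches) makes the mismatch concrete; the two subsections below formalize it:
+
+```python
+import torch
+
+# Micro-batch 0: 2 valid tokens with loss 1.0 each; micro-batch 1: 8 valid tokens with loss 0.0 each
+losses = [torch.tensor([1.0, 1.0]), torch.zeros(8)]
+n_k = [2, 8]          # valid tokens per micro-batch
+n_all = sum(n_k)      # 10 valid tokens in the global batch
+
+# Wrong: normalize inside each micro-batch, then average the micro-batch losses
+wrong = sum(l.sum() / n for l, n in zip(losses, n_k)) / len(losses)   # (1.0 + 0.0) / 2 = 0.5
+
+# Correct: global token-mean over all valid tokens
+right = sum(l.sum() for l in losses) / n_all                          # 2.0 / 10 = 0.2
+
+print(wrong.item(), right.item())  # 0.5 vs 0.2 -- not equivalent
+```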
+ +### 3.1 Token-level: Incorrect Normalization Within Micro-batches + +**Wrong approach** (normalize by micro-batch’s own token count): +$$ +\ell_k^{\text{wrong}} = \frac{1}{N_k} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +If micro-batches are equally weighted during averaging (e.g., via gradient averaging), the total loss becomes: +$$ +\frac{1}{M} \sum_{k=1}^{M} \ell_k^{\text{wrong}} = \frac{1}{M} \sum_{k=1}^{M} \left( \frac{1}{N_k} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +But the correct global `token-mean` loss is: +$$ +\mathcal{L}_{\text{token}} = \frac{1}{N_{\text{all}}} \sum_{k=1}^{M} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +These are only equal when all $N_k$ are identical. Under variable-length sequences or packing, $N_k$ varies significantly, causing bias. + +### 3.2 Sequence-level: Micro-batch `seq-mean` Causes Sample Weight Imbalance + +Take `seq-mean-token-mean` as an example: + +**Wrong approach** (normalize by micro-batch’s sample count $B_k$): +$$ +\ell_k^{\text{wrong}} = \frac{1}{B_k} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i, \quad \text{where } \bar{\mathcal{L}}_i = \frac{1}{N_i} \sum_t m_{i,t} \mathcal{L}_{i,t} +$$ + +After equal-weight averaging across micro-batches: +$$ +\frac{1}{M} \sum_{k=1}^{M} \ell_k^{\text{wrong}} = \frac{1}{M} \sum_{k=1}^{M} \left( \frac{1}{B_k} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i \right) +$$ + +But the correct global `seq-mean` is: +$$ +\mathcal{L}_{\text{seq-mean}} = \frac{1}{B} \sum_{i=1}^{B} \bar{\mathcal{L}}_i +$$ + +The former treats each micro-batch equally; the latter treats each sequence equally. When $B_k$ varies (common under packing), they are not equivalent. + +--- + +## 4. Correct Approach: Use Global Denominator + Sum Across Micro-batches + +ROLL follows these design principles: + +1. **Within each micro-batch, use global statistics as the denominator**; +2. **Each micro-batch’s returned loss should represent a partial contribution to the global loss**; +3. **The sum of all micro-batch losses must exactly equal the global loss**; +4. **Use `loss_scale` to counteract the backend’s default normalization behavior** (see Section 5). + +### 4.1 Correct Implementation for Token-level Loss + +For the $k$-th micro-batch: +$$ +\ell_k = \frac{1}{N_{\text{all}}} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +Then: +$$ +\sum_{k=1}^{M} \ell_k = \frac{1}{N_{\text{all}}} \sum_{k=1}^{M} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} = \mathcal{L}_{\text{token}} +$$ + +✅ Mathematically exact. + +### 4.2 Correct Implementation for Sequence-level Loss (e.g., `seq-mean-token-mean`) + +For the $k$-th micro-batch: +$$ +\ell_k = \frac{1}{B} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i +$$ + +Then: +$$ +\sum_{k=1}^{M} \ell_k = \frac{1}{B} \sum_{i=1}^{B} \bar{\mathcal{L}}_i = \mathcal{L}_{\text{seq-mean}} +$$ + +✅ Holds exactly even when $B_k$ varies (common under packing). + +--- + +## 5. `loss_scale`: Compensating for Backend Normalization + +Most training frameworks (e.g., Megatron, FSDP) implicitly normalize gradients under DP + GA to stabilize scale: + +- **GA dimension**: Average gradients over $A$ micro-steps (equivalent to `loss /= A`); +- **DP dimension**: Divide by $D$ after AllReduce (equivalent to averaging across ranks). 
+ +The combined effect is: +$$ +g \propto \frac{1}{M} \sum_{k=1}^{M} \nabla \ell_k, \quad M = D \times A +$$ + +However, ROLL’s aggregation design requires **summation semantics** across micro-batches: +$$ +\nabla \mathcal{L}_{\text{global}} = \sum_{k=1}^{M} \nabla \ell_k +$$ + +To cancel the backend’s $1/M$ normalization, multiply each micro-batch’s loss by: +$$ +\text{loss\_scale} = M +$$ + +Thus: +$$ +\frac{1}{M} \sum_{k=1}^{M} \nabla (M \cdot \ell_k) = \sum_{k=1}^{M} \nabla \ell_k +$$ + +✅ Recovers correct summation semantics. + +--- + +## 6. ROLL Interface: Global Stat Injection and `loss_scale` Control + +To enable **globally equivalent loss aggregation** at the micro-batch level, ROLL automatically injects global batch statistics (e.g., total valid tokens, total valid samples) into each training step. These statistics are **computed based entirely on user-specified `loss_mask_keys`**. + +### 6.1 `loss_mask_keys`: Define Loss Participation Scope and Drive Global Stat Injection + +`loss_mask_keys` is a list of strings declaring **which mask fields identify "valid tokens participating in loss computation."** This configuration not only guides how the loss function masks invalid positions but—more importantly—**determines how the strategy computes and injects global aggregation quantities**. + +You must set this in your pipeline’s data preprocessing or worker initialization: +```python +data.meta_info['loss_mask_keys'] = ['response_mask', 'labels_mask'] +``` + +For each key in `loss_mask_keys` (e.g., `'response_mask'`), ROLL’s strategy will: + +1. **Extract the corresponding mask tensor** from `data.batch` (typically shape `[batch_size, seq_len]`); +2. **Gather this mask across all DP ranks and GA steps**; +3. **Compute two global statistics**: + - **`batch_num_tokens[key]`**: Total sum of this mask over the entire global batch, i.e., + $$ + N_{\text{all}}^{(\text{key})} = \sum_{\text{all samples}} \sum_{t} \text{mask}_{i,t}^{(\text{key})} + $$ + - **`global_valid_samples[key]`**: Number of sequences with **at least one valid token**, i.e., + $$ + B^{(\text{key})} = \sum_{i=1}^{B} \mathbb{I}\left( \sum_{t} \text{mask}_{i,t}^{(\text{key})} > 0 \right) + $$ + +These statistics are injected into `data.meta_info` for use in `loss_func`. + +> ⚠️ **Critical Consistency Requirement**: The mask you use in `loss_func` for loss computation, weighting, or aggregation **must have identical semantics to the mask specified in `loss_mask_keys`**. +> For example, if `loss_mask_keys = ['response_mask']`, your loss must be computed **only** using `response_mask`. Using a different mask (e.g., `attention_mask`) will cause a mismatch between the numerator (loss computation) and denominator (global stats), breaking equivalence. + +### 6.2 Using Injected Global Statistics in `loss_func` + +In your custom `loss_func`, access global statistics as follows: + +```python +# Assume 'response_mask' is in loss_mask_keys +mask_key = 'response_mask' + +N_all = data.meta_info['batch_num_tokens'][mask_key] # Global valid token count +B_all = data.meta_info['global_valid_samples'][mask_key] # Global valid sample count +``` + +Then use these global values as denominators during aggregation (see Section 4) to ensure micro-batch computations exactly reconstruct the global loss. 
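+
+Putting these pieces together, a minimal `token-mean` `loss_func` sketch might look as follows. The exact function signature, return convention, and the `compute_per_token_loss` helper are illustrative assumptions, not the framework's reference implementation:
+
+```python
+def loss_func(data, output_tensor):
+    mask_key = "response_mask"                    # must be listed in loss_mask_keys
+    mask = data.batch[mask_key].float()           # [batch_size, seq_len]
+
+    # Hypothetical helper producing per-token losses for this micro-batch
+    per_token_loss = compute_per_token_loss(output_tensor, data)
+
+    # Use the injected GLOBAL denominator, never this micro-batch's mask.sum()
+    n_all = data.meta_info["batch_num_tokens"][mask_key]
+    loss = (per_token_loss * mask).sum() / n_all  # partial contribution to the global token-mean
+
+    metrics = {"actor/custom_loss@sum": loss.detach().item()}
+    return loss, metrics
+```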
+ +### 6.3 `apply_loss_scale`: Controlling Gradient Scale Correction + +Since training backends (e.g., Megatron/FSDP) typically apply implicit $1/(D \times A)$ normalization under DP + GA, while ROLL relies on **summation semantics**, compensation via `loss_scale = D \times A` is needed. + +In `worker_config`, the parameter `apply_loss_scale` controls whether this scaling is applied automatically: + +- **Default: `True`** (recommended to keep enabled) +- **Effect**: Framework automatically multiplies the loss returned by `loss_func` by `loss_scale` +- **When to disable**: Only if you manually implement the full global loss (including scale) inside `loss_func`—generally not advised. + +--- + +## 7. Metrics Logging: Use `@sum` Semantics + +For losses aggregated using global denominators, metrics should be **summed—not averaged—during multi-worker reduction**. + +ROLL supports specifying reduction behavior via an `@operator` suffix in metric names: + +```python +metrics = { + "actor/kl_loss@sum": kl_loss.detach().item(), +} +reduce_metrics(metrics) +``` + +- `@sum`: Sum values across all workers during reduction; +- `@mean` (default): Average across workers; +- The logger automatically strips everything from `@` onward, so it displays as `actor/kl_loss`. + +--- + +## 8. Code Example: Globally Equivalent KL Loss Implementation in Actor + +### 8.1 Compute Per-Token KL + +```python +kl_loss = compute_approx_kl( + log_probs=log_probs, + log_probs_base=ref_log_probs, + action_mask=final_response_mask, + kl_penalty="k3" +) +``` + +### 8.2 Aggregate Using Global Denominator + +```python +kl_loss = agg_loss( + loss_mat=kl_loss, + loss_mask=final_response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'], +) +``` + +### 8.3 Key Implementation of `agg_loss` + +```python +def agg_loss(loss_mat, loss_mask, loss_agg_mode, batch_num_tokens=None, global_valid_samples=None, weights=None): + if batch_num_tokens is None: + batch_num_tokens = loss_mask.sum() + if global_valid_samples is None: + global_valid_samples = loss_mat.size(0) + + if loss_agg_mode == "token-mean": + loss = (loss_mat * loss_mask).sum() / batch_num_tokens + elif loss_agg_mode == "seq-mean-token-sum": + seq_losses = (loss_mat * loss_mask).sum(dim=-1) + valid = (loss_mask.sum(dim=-1) > 0).float() + loss = (seq_losses * valid).sum() / (global_valid_samples + 1e-8) + elif loss_agg_mode == "seq-mean-token-mean": + seq_means = masked_mean(loss_mat, loss_mask, dim=-1) # Custom function supporting mask + valid = (loss_mask.sum(dim=-1) > 0).float() + loss = (seq_means * valid).sum() / (global_valid_samples + 1e-8) + else: + raise ValueError(f"Unsupported loss_agg_mode: {loss_agg_mode}") + + return loss +``` + +### 8.4 Log Metrics + +```python +pg_metrics = {"actor/kl_loss@sum": kl_loss.detach().item()} +``` + +--- + +## 9. Design Checklist: Custom Loss Implementation (⚠️ Summary of Critical Points) + +To ensure mathematical equivalence and training stability under any configuration, strictly follow this checklist: + +### ✅ **Loss Granularity and Aggregation Mode** +- Clearly decide whether your loss is **token-level** or **sequence-level**. +- Choose the correct `loss_agg_mode` (e.g., `"token-mean"`, `"seq-mean-token-mean"`). + +### ✅ **Use Global Denominators (Critical!)** +- **Never** use local micro-batch statistics (e.g., `loss_mask.sum()` or `loss_mat.shape[0]`) as denominators. 
+- **Always** use global statistics from `data.meta_info['batch_num_tokens'][key]` and `data.meta_info['global_valid_samples'][key]`. + +### ✅ **`loss_mask_keys` Configuration and Consistency (Common Pitfall!)** +- Explicitly set `data.meta_info['loss_mask_keys']` in your pipeline. +- **Ensure** the mask used in `loss_func` for computation/masking/weighting **exactly matches** the key(s) in `loss_mask_keys`. +- If using multiple masks (e.g., response + labels), include all in `loss_mask_keys` and handle them separately. + +### ✅ **`apply_loss_scale` Setting** +- **Keep default `True`** unless you fully understand and manage scaling logic yourself. +- Disabling incorrectly causes gradients to be implicitly scaled down by $1/(D \times A)$, leading to divergence or extremely slow convergence. + +### ✅ **Metrics Logging Convention** +- For losses using global denominators, **always** append `@sum` to metric names (e.g., `"loss@sum"`). +- Otherwise, reduction by averaging will log incorrect (underestimated by $M\times$) loss values. + +### ✅ **Special Care Under Packing** +- Never assume fixed $B_k$ (sample count) or $N_k$ (token count) per micro-batch. +- All aggregation logic must **avoid dependence on micro-batch-local statistics** and rely solely on injected global values. \ No newline at end of file diff --git a/docs_roll/docs/Development/Developer Guide/llm_as_judge_optimization.md b/docs_roll/docs/Development/Developer Guide/llm_as_judge_optimization.md new file mode 100644 index 000000000..f887c6fef --- /dev/null +++ b/docs_roll/docs/Development/Developer Guide/llm_as_judge_optimization.md @@ -0,0 +1,262 @@ +# LLM as Judge Optimization in Agentic Environments + +This document describes the optimized implementation of LLM as Judge in Agentic environments within the ROLL framework, including system architecture, call chains, configuration methods, and best practices. + +## Overview + +LLM as Judge is a method that uses large language models as evaluators to assess agent response quality. In Agentic training scenarios, when large-scale environment instances perform concurrent rollouts, using LLM as Judge to compute rewards generates massive concurrent LLM requests, which poses significant challenges to the stability and throughput of external LLM services. + +To address this challenge, the ROLL framework implements a scalable **localized parallel evaluation system** through an **independent Reward Cluster** and **efficient scheduling mechanisms**, avoiding dependency on external services and ensuring the stability and controllability of the training process. + +:::info Documentation Scope +This document uses the **DeepEyes environment's** LLM as Judge implementation as an example. For other environments that need LLM as Judge, you can refer to the calling patterns in `env_manager` and `env` to implement your own custom solutions. 
+::: + +### Key Advantages + +- **Independent Resource Management**: Reward model is separated from Policy model, allowing independent GPU resource allocation and avoiding resource contention +- **Localized Deployment**: Avoid external API dependencies through local Reward Cluster, ensuring service stability and data security +- **High Concurrency Support**: Efficient parallel reward evaluation through RequestScheduler, supporting scalable environment concurrency +- **Unified Interface Design**: Provides `generate_by_proxy` unified utility function, simplifying LLM calls and supporting both text and multimodal inputs +- **Flexible Configuration**: Supports multiple inference backends (vLLM, SGLang) and custom generation parameters + +### Application Scenarios + +Typical Agentic training scenarios: +- **Environment Scale**: 256 environment groups with 4 environments each, totaling 1024 concurrent environment instances +- **Rollout Frequency**: Each environment calls LLM Judge after completing an episode +- **Concurrency Pressure**: During rollout peaks, 500+ environments may simultaneously request reward evaluation +- **Stability Requirements**: Training process cannot be interrupted by external API rate limiting or timeouts + +The optimized implementation described in this document effectively addresses these challenges. + +## System Architecture + +### Overall Architecture + +``` +AgenticPipeline + ├── Reward Cluster (optional, independent GPU resources) + │ ├── InferWorker (default) + │ └── Supports vLLM/SGLang backends + │ + ├── Reward Scheduler (Ray Named Actor) + │ ├── Request routing and load balancing + │ ├── Concurrency control + │ └── Request tracking and cleanup + │ + └── Environment Manager + ├── llm_proxy: for policy inference + ├── reward_proxy: for LLM as Judge + └── env instances + └── Call reward_proxy in obtain_outcome_reward +``` + +### Key Components + +#### 1. Reward Cluster + +**Location**: `roll/pipeline/agentic/agentic_pipeline.py:88-98` + +Reward Cluster is an optional component, created only when `device_mapping` is configured: + +```python +self.reward = None +if (self.pipeline_config.reward is not None and + len(self.pipeline_config.reward.device_mapping) > 0): + self.reward = Cluster( + name=self.pipeline_config.reward.name, + worker_cls=self.pipeline_config.reward.worker_cls, # Default: InferWorker + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reward, + ) +``` + +**Worker Class Default Configuration**: `roll/pipeline/agentic/agentic_config.py:287` +- Defaults to `InferWorker` as inference engine, reusing ActorInfer Worker implementation +- Supports multiple backends including vLLM and SGLang + +#### 2. Reward Scheduler (Ray Named Actor) + +**Location**: `roll/pipeline/agentic/agentic_pipeline.py:112-125` + +Reward Scheduler is created as a Ray Named Actor for shared access by all environment managers: + +```python +self.reward_scheduler = RequestScheduler.options( + name=f"RewardScheduler-{self.pipeline_config.reward.name}", + get_if_exists=True, + namespace=RAY_NAMESPACE, + scheduling_strategy=NodeAffinitySchedulingStrategy(...) 
+).remote( + infer_cluster=self.reward, + pipeline_config=self.pipeline_config, + resource_manager=self.resource_manager, +) +``` + +**Core Functionality**: + +- **Smart Routing**: Uses least-loaded routing algorithm to distribute requests to different DP ranks +- **Sticky Routing**: Requests from the same environment are routed to the same worker (beneficial for KV cache) +- **Request Tracking**: Maintains mapping from `request_id` to workers + +#### 3. Reward Proxy + +**Location**: `roll/pipeline/agentic/env_manager/vl_traj_env_manager.py:85-109` + +Environment manager retrieves Reward Scheduler via Ray and creates Reward Proxy: + +```python +# Get reward scheduler from Ray (Named Actor) +if self.pipeline_config.reward: + self.reward_scheduler = ray.get_actor( + name=f"RewardScheduler-{pipeline_config.reward.name}", + namespace=RAY_NAMESPACE + ) + + # Create reward proxy + self.reward_proxy = create_llm_proxy( + generate_scheduler=self.reward_scheduler, + llm_proxy_config=pipeline_config.reward.llm_proxy, + tokenizer=self.reward_tokenizer, + env=None, + ) +``` + +**Proxy Factory Function**: `roll/pipeline/agentic/llm_proxy/__init__.py:11` +- Supports multiple proxy types: `policy`, `openai`, `random` +- Extensible through registration mechanism +- Policy proxy has been validated in training; for externally deployed LLM services, use openai proxy (note concurrency challenges) + +#### 4. Unified Utility Function `generate_by_proxy` + +**Location**: `roll/pipeline/agentic/llm_proxy/proxy_utils.py:18-170` + +This is the core component called by environments, providing a unified LLM calling interface: + +```python +def generate_by_proxy( + messages: List[Dict[str, Any]], + tokenizer: PreTrainedTokenizer, + proxy: BaseLLMProxy, + enable_thinking: bool = False, + generation_config: Optional[Dict[str, Any]] = None, + collator: Optional[Any] = None, + mm_data: Optional[Dict[str, Any]] = None, + src_rank: Optional[int] = None, +) -> Optional[str] +``` + +**Core Features**: + +- **Unified Interface**: Same calling pattern for both text and multimodal inputs +- **Automatic Formatting**: Uses `tokenizer.apply_chat_template` to format messages +- **Multimodal Support**: Supports image/video inputs through `collator` parameter +- **Thinking Mechanism**: Supports chain-of-thought for models like DeepSeek and Qwen +- **Routing Control**: Implements sticky routing through `src_rank` parameter +- **Error Handling**: Returns `None` to indicate inference failure, handled by caller + +## Call Chain + +### Complete Call Flow + +``` +1. DeepEyesEnv.step() (env/deepeyes/env.py:182-197) + Triggers obtain_outcome_reward when done=True + ↓ +2. DeepEyesEnv.obtain_outcome_reward() (env/deepeyes/env.py:199-254) + Builds judge prompt, calls reward model + ↓ +3. generate_by_proxy() (llm_proxy/proxy_utils.py:18) + Unified LLM calling utility function + ↓ +4. reward_proxy.generate() (llm_proxy/policy_proxy.py:15) + Calls scheduler via Ray + ↓ +5. reward_scheduler.generate_one_request() (scheduler/generate_scheduler.py:1296) + Request routing and load balancing + ↓ +6. infer_cluster.workers[dp_rank].generate_request() + Actual model inference + ↓ +7. 
Returns LLM judgment result +``` + +## Configuration Guide + +### Complete Configuration Example + +```yaml +# Reward Configuration (LLM as Judge for AgenticPipeline) +reward: + name: "reward" + worker_cls: "roll.pipeline.base_worker.InferWorker" # Default value, can be omitted + model_args: + model_name_or_path: Qwen/Qwen2.5-72B-Instruct + dtype: bf16 + generating_args: + max_new_tokens: 2048 + temperature: 0.2 # Lower temperature for stable judgments + top_p: 0.95 + top_k: 20 + strategy_args: + strategy_name: vllm # or sglang + strategy_config: + gpu_memory_utilization: 0.8 + tensor_parallel_size: 4 + load_format: auto + # Critical: Must be non-empty to create reward cluster + device_mapping: list(range(8, 16)) # GPUs 8-15 + llm_proxy: + proxy_type: policy # Use policy proxy +``` + +### Configuration Key Points + +#### 1. device_mapping (Required) + +```yaml +# Recommended: Policy and Reward use independent GPUs +actor_infer: + device_mapping: list(range(0, 8)) # GPUs 0-7 + +reward: + device_mapping: list(range(8, 16)) # GPUs 8-15, independent resources +``` + +- **Empty or None**: Reward cluster not created, environments cannot use LLM as Judge +- **Non-empty**: Creates independent reward cluster, enables LLM as Judge +- **Independent Deployment**: Use different GPU resources from actor_infer. Policy inference and Reward evaluation run in parallel. actor_infer and reward must be deployed independently + +#### 2. strategy_name (Inference Backend Selection) + +```yaml +strategy_args: + strategy_name: vllm # or sglang + strategy_config: + gpu_memory_utilization: 0.8 + tensor_parallel_size: 4 + load_format: auto # Must configure auto; vllm/sglang strategies default to dummy load which randomly initializes parameters +``` + +#### 3. generating_args (Generation Parameters) + +```yaml +generating_args: + max_new_tokens: 2048 # Adjust based on judge output length + temperature: 0.2 # Lower temperature for stability + top_p: 0.95 + top_k: 20 +``` + +## Summary + +The optimized LLM as Judge implementation in Agentic environments achieves efficient scalability through the following key designs: + +1. **Independent Reward Cluster**: Resource isolation, avoiding competition with Policy inference +2. **Ray Named Actor**: Reward Scheduler as a shared service, accessible by all environments +3. **Unified Utility Function**: `generate_by_proxy` simplifies calls, supports text and multimodal +4. **Smart Routing**: Sticky routing and load balancing, improving cache utilization + +By properly configuring and using these components, you can build an efficient and reliable LLM as Judge evaluation system. diff --git a/docs_roll/docs/Development/Developer Guide/rollout_mock_usage.md b/docs_roll/docs/Development/Developer Guide/rollout_mock_usage.md new file mode 100644 index 000000000..f0bed1d04 --- /dev/null +++ b/docs_roll/docs/Development/Developer Guide/rollout_mock_usage.md @@ -0,0 +1,289 @@ +--- +sidebar_position: 4 +--- + +# Rollout Dump Mock Usage Guide + +## Overview + +Rollout Dump Mock is a powerful debugging tool in the ROLL framework designed to **eliminate randomness in the rollout phase of RL training**, enabling numerical precision alignment verification. By saving and replaying rollout data, it helps developers quickly validate the correctness of computational optimizations. 
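+
+Conceptually, dump mode serializes each training step's rollout batch to a per-step file, and mock mode loads that file back instead of re-running rollout. A minimal sketch of this idea (the file layout follows the examples below; this is not the framework's actual implementation):
+
+```python
+import os
+import pickle
+
+def dump_step(batch, dump_dir: str, mode: str, step: int) -> None:
+    # e.g. ./output/rollout_dumps/baseline_v1/train/step_000003.pkl
+    path = os.path.join(dump_dir, mode, f"step_{step:06d}.pkl")
+    os.makedirs(os.path.dirname(path), exist_ok=True)
+    with open(path, "wb") as f:
+        pickle.dump(batch, f)
+
+def load_step(dump_dir: str, mode: str, step: int):
+    # Replay a previously captured batch for deterministic training
+    path = os.path.join(dump_dir, mode, f"step_{step:06d}.pkl")
+    with open(path, "rb") as f:
+        return pickle.load(f)
+```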
+ +### Core Value + +- **Eliminate Randomness**: Enable numerical precision alignment verification +- **Fast Iteration**: Mock mode skips expensive environment rollout +- **Reproducible Debugging**: Capture problematic rollout data for repeated debugging +- **Transparent Architecture**: Implemented at the Scheduler layer, completely transparent to the Pipeline + +### Use Cases + +| Scenario | Description | +|----------|-------------| +| **Computation Optimization Verification** | Verify numerical consistency of optimizations like dynamic_batching, sequence_packing | +| **Model Parallelism Verification** | Verify precision alignment of TP, PP, EP and other parallel strategies | +| **Regression Testing** | Automated precision testing in CI/CD pipelines | + +--- + +## Quick Start + +### Typical Workflow + +``` +[1. Dump Mode] → [2. Modify Code] → [3. Mock Mode] → [4. Precision Verification] + ↓ ↓ ↓ ↓ + Capture baseline Optimize compute Deterministic Numerical + data logic replay comparison +``` + +### Step 1: Dump Mode - Capture Baseline Data + +Before modifying code, capture correct rollout data as a baseline. + +**Configuration File** (`agentic_sokoban_rollout_mock_dump.yaml`): +```yaml +exp_name: "sokoban_precision_test_dump" +max_steps: 50 + +# Rollout Mock Configuration - DUMP MODE +rollout_mock: + enable: true + mode: dump + dump_dir: ./output/rollout_dumps/baseline_v1 + +# Environment variables for deterministic execution +system_envs: + NCCL_ALGO: Ring + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' + CUBLAS_WORKSPACE_CONFIG: ':4096:8' + DETERMINISTIC_MODE: '1' + +# ... other configurations ... +``` + +**Command**: +```bash +python examples/start_agentic_pipeline.py \ + --config_name agentic_sokoban_rollout_mock_dump \ + --config_path examples/qwen2.5-0.5B-agentic +``` + +**Output**: +``` +./output/rollout_dumps/baseline_v1/ + └── train/ + ├── step_000000.pkl (~5MB) + ├── step_000001.pkl + ├── step_000002.pkl + ├── ... + └── step_000049.pkl +``` + +**Log Example**: +``` +[Rollout Mock] Rollout Mock enabled: mode=dump, dir=./output/rollout_dumps/baseline_v1 +[Rollout Mock] Dumped step 0: ./output/rollout_dumps/baseline_v1/train/step_000000.pkl (samples=128, size=4.82MB) +[Rollout Mock] Dumped step 1: ./output/rollout_dumps/baseline_v1/train/step_000001.pkl (samples=128, size=4.85MB) +``` + +### Step 2: Modify Code + +Implement your computational optimizations, such as: +- Adding dynamic_batching +- Implementing sequence_packing +- Migrating to new parallel strategies + +### Step 3: Mock Mode - Deterministic Replay + +Use pre-recorded rollout data to verify that modified code maintains numerical consistency. + +**Configuration File** (`agentic_sokoban_rollout_mock_mock.yaml`): +```yaml +exp_name: "sokoban_precision_test_mock" +max_steps: 50 + +# Rollout Mock Configuration - MOCK MODE +rollout_mock: + enable: true + mode: mock + dump_dir: ./output/rollout_dumps/baseline_v1 # Same path as dump mode + +# Environment variables for deterministic execution (keep consistent with dump mode) +system_envs: + NCCL_ALGO: Ring + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' + CUBLAS_WORKSPACE_CONFIG: ':4096:8' + DETERMINISTIC_MODE: '1' + +# ... other configurations (keep consistent with dump mode) ... 
+``` + +**Command**: +```bash +python examples/start_agentic_pipeline.py \ + --config_name agentic_sokoban_rollout_mock_mock \ + --config_path examples/qwen2.5-0.5B-agentic +``` + +**Behavior**: +- ✅ Directly loads DataProto from disk for each step +- ✅ All subsequent computations (advantages, losses, gradients) are fully deterministic + +**Log Example**: +``` +[Rollout Mock] Rollout Mock enabled: mode=mock, dir=./output/rollout_dumps/baseline_v1 +[Rollout Mock] Loaded step 0: ./output/rollout_dumps/baseline_v1/train/step_000000.pkl (samples=128) +[Rollout Mock] Loaded step 1: ./output/rollout_dumps/baseline_v1/train/step_000001.pkl (samples=128) +``` + + +### Step 4: Numerical Precision Verification + +Compare training metrics between baseline and optimized versions to ensure complete numerical consistency. You can verify that both runs produce identical results by examining key metrics (such as pg_loss, total_loss, value_loss, approx_kl, grad_norm, etc.) in the logs. +--- + +## Configuration Parameters + +### Configuration Schema + +Add the `rollout_mock` section to your YAML configuration file: + +```yaml +rollout_mock: + enable: bool # Enable rollout dump/mock mechanism + mode: "dump" | "mock" # dump: save data, mock: load data + dump_dir: str # Data storage directory +``` + +### Configuration Examples + +**Dump Mode Configuration**: +```yaml +rollout_mock: + enable: true + mode: dump + dump_dir: ./rollout_dumps/precision_test_v1 +``` + +**Mock Mode Configuration**: +```yaml +rollout_mock: + enable: true + mode: mock + dump_dir: ./rollout_dumps/precision_test_v1 # Same path as dump mode +``` + +### Environment Variables for Deterministic Execution + +To ensure complete numerical reproducibility, the following environment variables should be configured: + +```yaml +system_envs: + NCCL_ALGO: Ring # Use Ring algorithm for NCCL + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' # Disable non-deterministic algorithms in Transformer Engine + CUBLAS_WORKSPACE_CONFIG: ':4096:8' # Enable deterministic CUDA operations + DETERMINISTIC_MODE: '1' # Enable PyTorch deterministic mode +``` + +**DETERMINISTIC_MODE Effects**: +- Sets `torch.backends.cudnn.deterministic = True` for reproducible cuDNN operations +- Sets `torch.backends.cudnn.benchmark = False` to disable auto-tuning that causes non-determinism +- Calls `torch.use_deterministic_algorithms(True)` to enforce deterministic PyTorch algorithms + +**Important**: These environment variables must be kept consistent between dump and mock modes to ensure numerical precision alignment. + +### Key Considerations + +1. **dump_dir must match**: Dump and Mock modes must use the same `dump_dir` path +2. **mode must match**: Scheduler mode (train/val) must match the dump mode +3. **max_steps cannot exceed**: Mock mode `max_steps` cannot exceed the value used in Dump mode +4. **system_envs must be consistent**: Environment variables for deterministic execution should be identical between dump and mock modes + +--- + +## Common Issues and Troubleshooting + +### Issue 1: Mock File Not Found + +**Error Message**: +``` +FileNotFoundError: [Rollout Mock] Mock file not found: ./dumps/baseline/train/step_000005.pkl +Possible reasons: + 1. Step 5 was not run in dump mode + 2. dump_dir configuration is incorrect: ./dumps/baseline + 3. mode mismatch (current: train) +Please run in dump mode first to ensure all step data is generated. +``` + +**Troubleshooting Steps**: + +1. 
Check if enough steps were run in dump mode: + ```bash + ls -lh ./output/rollout_dumps/baseline_v1/train/ + # Should see step_000000.pkl ~ step_000049.pkl + ``` + +2. Confirm `max_steps` consistency: + ```bash + # Dump: max_steps=50 + # Mock: max_steps=50 (must match or be smaller) + ``` + +3. Verify `dump_dir` path is correct: + ```yaml + # Dump mode + dump_dir: ./output/rollout_dumps/baseline_v1 + + # Mock mode (must be same) + dump_dir: ./output/rollout_dumps/baseline_v1 + ``` + +### Issue 2: Mode Mismatch + +**Problem**: Used train mode during dump, but accidentally used val mode during mock. + +**File Structure**: +``` +dumps/baseline/ + ├── train/ # Generated during dump + │ └── step_*.pkl + └── val/ # Empty directory + └── (no files) +``` + +**Solution**: Ensure dump and mock use the same scheduler mode (train/val). + +### Issue 3: Insufficient Disk Space + +**Symptom**: Error during dump: +``` +OSError: [Errno 28] No space left on device +``` + +**Disk Usage Estimation**: +``` +Single step file size ≈ batch_size × seq_len × data type size + ≈ 128 × 512 × 4 bytes (float32) + ≈ 256KB ~ 10MB (depending on sequence length and metadata) + +Total disk usage ≈ single step size × max_steps + ≈ 5MB × 100 steps = 500MB +``` + +**Solutions**: +- Increase disk space +- Reduce `max_steps` +- Use network storage (OSS, etc.) + +### Issue 4: Pickle Version Incompatibility + +**Symptom**: Error when loading across different Python versions: +``` +pickle.UnpicklingError: invalid load key, '\x00' +``` + +**Cause**: Pickle compatibility issues between different Python versions. + +**Solutions**: +- Ensure dump and mock use the same Python version +- Or use a lower protocol version during dump (requires source code modification) diff --git a/docs_roll/docs/User Guides/Advanced Features/dynamic_batching.md b/docs_roll/docs/User Guides/Advanced Features/dynamic_batching.md new file mode 100644 index 000000000..8369a57e9 --- /dev/null +++ b/docs_roll/docs/User Guides/Advanced Features/dynamic_batching.md @@ -0,0 +1,214 @@ +# ROLL Dynamic Batching + +The ROLL framework supports **Dynamic Batching** for rollout batches. This feature minimizes invalid token computation and improves overall computational efficiency. This document provides a detailed guide on how to use this feature. + +## Glossary + +- attention_mask: data in the rollout batch ,where `1` represents a real token and `0` represents a `pad_token` +- micro_batch (mbs): The micro-batch during the model forward pass. +- num_micro_batches: The number of micro_batch in one mini-batch. +- micro_batch_size: The number of sequences in the micro_batch. +- micro_batch_seqlen: The sequence length in the micro_batch. +- dp_size, dp_rank, shard: The size of data parallelism, the specific rank within the data parallel group and the training data in the data parallel group. +- vpp: Virtual Pipeline Model Parallelism; an efficient pipeline parallel technique supported by the Megatron-LM framework. + +## Introduction + +In Reinforcement Learning (RL) training, the data generated during rollout phase has a **long-tail** effect, that the sequence lengths vary significantly. This phenomenon is even more pronounced in **Agentic Pipelines**, where training data is generated through multi-turn interactions with an environment. + +In the train step of RL, all samples in a rollout batch are typically padded to a fixed `max_len`. Consequently, these pad tokens are included in the calculation, leading to a waste of computational resources. 
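+
+The overhead can be read directly off the `attention_mask`; a small illustrative sketch:
+
+```python
+import torch
+
+# attention_mask: [num_seqs, max_len]; 1 = real token, 0 = pad_token
+attention_mask = torch.tensor([
+    [1, 1, 1, 1, 1, 1, 1, 0, 0, 0],
+    [1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
+    [1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
+])
+
+total_tokens = attention_mask.numel()      # 30
+real_tokens = int(attention_mask.sum())    # 16
+print(f"pad ratio: {1 - real_tokens / total_tokens:.0%}")  # ~47% of tokens are padding
+```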
+ +To address this and improve efficiency, the core idea of Dynamic Batching is: +- Partition the rollout batch across DP (Data Parallel) Ranks according to actual tokens and ensure a balanced workload. +- The sequence of samples is rearranged so that samples with similar lengths are grouped together, to remove as many pad tokens as possible. + +## Example +The following example briefly illustrates the process of Dynamic Batching in ROLL. + +**Assumptions:** `dp_size=2`, `num_seqs=8`, `max_tokens_microbatch=10`, `sequence_length_round=2` + +Original input `attention_mask` +```bash +attention_mask: +[1, 1, 1, 1, 1, 1, 1, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 0, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 1, 1, 0, 0] +[1, 1, 1, 1, 1, 0, 0, 0, 0, 0] +[1, 0, 0, 0, 0, 0, 0, 0, 0, 0] +[1, 1, 1, 0, 0, 0, 0, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 1, 1, 0, 0] +[1, 1, 1, 1, 1, 1, 0, 0, 0, 0] +``` +The corresponding `seq_lens` are: +```bash +seq_lens: +[7, 6, 8, 5, 1, 3, 8, 6] +``` + +As shown, the number of actual tokens varies significantly between sequences, causing the waste of GPU resources for processing `pad_tokens`. + +To optimize efficiency, ROLL Dynamic Batching follows these steps to eliminate pad tokens within a `micro_batch`: + +**1. Sort and Shard:** A shard represents the training data within each dp_rank. By default, the data is sharded in order. In Dynamic Batching, sequences are first sorted by their actual length and then sharded to ensure that the number of tokens is balanced across dp_ranks. +```bash +# seq_lens after sorting: +[1, 3, 5, 6, 6, 7, 8, 8] + +# Partition into dp_size shards: +shard0: + [1, 5, 6, 8] +shard1: + [3, 6, 7, 8] +``` + +**2. Micro-batch Partition:** + +The partition process consider the following two parameters: + +- `max_tokens_per_microbatch`: The maximum number of tokens allowed in one micro_batch. `micro_batch_size * micro_batch_seqlen` cannot exceed this value. If it is exceeded, a new micro_batch must be created. +- `sequence_length_round`: The `micro_batch_seqlen` must be a multiple of this value. For example, the sequence lengths in a micro_batch is [200, 240] and `sequence_length_round` is 64, the sequences in this micro-batch must be padded to a length of 256. + +The shard partition process for Dynamic Batching aims to find the split that maximizes the number of tokens in a micro-batch, while ensuring the numer of tokens in mirco_batch cannot exceed `max_tokens_per_microbatch`. It also ensures that the sequence length for each micro-batch is padded up to a multiple of `sequence_length_round`. + +The process is detailed as follows: + + + +```bash +shard0: + mbs0: # Padding length 6 + [1, 0, 0, 0, 0, 0 + 1, 1, 1, 1, 1, 0] + mbs1: # Padding length 8 + [1, 1, 1, 1, 1, 1, 0, 0] + mbs2: # Padding length 8 + [1, 1, 1, 1, 1, 1, 1, 1] + +shard1: + mbs0: # Padding length 6 + [1, 1, 1, 0, 0, 0 + 1, 1, 1, 1, 1, 1] + mbs1: # Padding length 8 + [1, 1, 1, 1, 1, 1, 1, 0] + mbs2: # Padding length 8 + [1, 1, 1, 1, 1, 1, 1, 1] +``` +In this example, the original total token count was `80` (`8 * 10`). After Dynamic Batching, the total token count is reduced to 56, removing 30% of the `pad_tokens`. + +**3. Support Virtual Pipeline Model Parallel :** Split micro-batches with more tokens and `micro_batch_size > 1`. This ensures the number of micro-batches is an integer multiple of `pp_size` (compatible with Megatron). 
+ +Since the `num_microbatches` in the original example is not divisible by pp_size, mbs0 is selected and split into two mbs, as follows: + +```bash +shard0: + mbs0: # padding length 6 + [1, 0, 0, 0, 0, 0] + mbs1: # padding length 6 + [1, 1, 1, 1, 1, 0] + mbs2: # padding length 8 + [1, 1, 1, 1, 1, 1, 0, 0] + mbs3: # padding length 8 + [1, 1, 1, 1, 1, 1, 1, 1] +shard1: + mbs0: # padding length 6 + [1, 1, 1, 0, 0, 0] + mbs1: # padding length 6 + [1, 1, 1, 1, 1, 1] + mbs2: # padding length 8 + [1, 1, 1, 1, 1, 1, 1, 0] + mbs3: # padding length 8 + [1, 1, 1, 1, 1, 1, 1, 1] + +``` + +## Configuration Parameters + +The Dynamic Batching parameters are divided into `train` and `infer`: + +### Train +- `use_dynamic_batching_in_train`: Whether to enable this feature during the `train_step`. +- `max_tokens_per_microbatch_in_train`: The maximum number of tokens allowed per micro-batch during training. +- `sequence_length_round_in_train`: The sequence length of each micro-batch must be divisible by this value. It should also be divisible by `tensor_model_parallel_size * context_parallel_size`. Common values are 128 or 64. + +### Infer +- `use_dynamic_batching_in_infer`: Whether to enable this during phases that do not require gradient update (e.g., `compute_log_probs`). +- `max_tokens_per_microbatch_in_infer`: Same as the train, usually be higher depending on gpu memory. +- `sequence_length_round_in_infer`: Same as train. + +## Full Configuration + +```yaml +actor_train: + # Flash Attention is recommended when using both Dynamic Batching and Context Parallel + system_envs: + NVTE_FLASH_ATTN: '1' + NVTE_FUSED_ATTN: '0' + NVTE_UNFUSED_ATTN: '0' + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_train: true + max_tokens_per_microbatch_in_train: 8192 + sequence_length_round_in_train: 128 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 +``` \ No newline at end of file diff --git a/docs_roll/docs/User Guides/Advanced Features/sequence_packing.md b/docs_roll/docs/User Guides/Advanced Features/sequence_packing.md new file mode 100644 index 000000000..efcdb42fe --- /dev/null +++ 
b/docs_roll/docs/User Guides/Advanced Features/sequence_packing.md @@ -0,0 +1,319 @@ +# SEQUENCE PACKING IN ROLL + +The ROLL framework now supports **Sequence Packing**, a feature that eliminates padding tokens by packing variable-length sequences together, thereby improving computational efficiency. This document provides a detailed explanation of the implementation rationale and configuration methods for this feature. + +> **Note**: Currently, only `megatron_strategy` supports `sequence_packing`. + +## 1. Introduction + +In reinforcement learning (RL) training scenarios, rollout data typically exhibits a long-tailed distribution. In conventional training pipelines, samples within a micro-batch are padded to a fixed maximum sequence length before being grouped into a batch for training. This approach wastes significant computational resources on processing padding tokens and slows down training. + +To address this issue, ROLL introduces **Sequence Packing**, which: +- Packs sequences of varying lengths within each micro-batch to eliminate padding tokens. +- Employs optimized packing algorithms to improve packing efficiency, reduce the number of micro-batches, and accelerate training. + +## 2. Implementation Principles + +### 2.1 Data Partitioning Hierarchy + +In distributed training, data is organized in the following hierarchical structure: + +``` +GLOBAL BATCH (Global Batch) +├── DP RANK 0 → BATCH 0 +│ └── MINI BATCH 0 (used for one gradient update) +│ ├── MICRO BATCH 0 (smallest computation unit) +│ ├── MICRO BATCH 1 +│ └── ... +├── DP RANK 1 → BATCH 1 +│ └── MINI BATCH 0 +│ ├── MICRO BATCH 0 +│ └── ... +└── ... +``` + +- **GLOBAL BATCH**: The complete rollout results generated by `actor_infer`. +- **BATCH**: A subset of the Global Batch assigned to a specific Data Parallel (DP) rank. +- **MINI BATCH**: A portion of a Batch used for a single gradient update (considering gradient accumulation). +- **MICRO BATCH**: The smallest computational unit derived from a Mini Batch, used in a single forward/backward pass. + +In standard training, all samples within a micro-batch are padded to a fixed length, leading to substantial computational waste. Sequence Packing solves this by packing sequences at the micro-batch level. + +### 2.2 Core Mechanism of Sequence Packing + +The primary goal of Sequence Packing is to eliminate padding tokens while ensuring correct and efficient execution under complex distributed training configurations—particularly when Context Parallelism (CP) and Tensor Parallelism (TP) are enabled. To achieve this, the packing process must satisfy specific alignment constraints critical for both correctness and performance. + +#### 2.2.1 Alignment Requirement: Multiple of 2 × CP_SIZE × TP_SIZE + +When Context Parallelism (CP) and Tensor Parallelism (TP) are enabled, the packed sequence length **must be a multiple of `2 × CP_SIZE × TP_SIZE`**. + +This requirement stems from the needs of both parallelism strategies: + +1. **TENSOR PARALLELISM (TP)**: When Sequence Parallelism is enabled, sequences are split across TP ranks during the forward pass. Thus, the sequence length must be divisible by `TP_SIZE`. + +2. **CONTEXT PARALLELISM (CP)**: To achieve load balancing in CP, sequences must be logically divided into `2 × CP_SIZE` chunks. Hence, the sequence length must also be divisible by `2 × CP_SIZE`. + +Combining these two requirements, the sequence length must be a multiple of **`2 × CP_SIZE × TP_SIZE`** to ensure compatibility with both TP and CP. 
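+
+A small helper illustrates how a packed sequence length would be rounded up to this alignment boundary (a sketch, not the framework's internal padding routine):
+
+```python
+def pad_to_alignment(seq_len: int, cp_size: int, tp_size: int) -> int:
+    """Round seq_len up to the next multiple of 2 * cp_size * tp_size."""
+    align = 2 * cp_size * tp_size
+    return ((seq_len + align - 1) // align) * align
+
+# With CP_SIZE=2, TP_SIZE=1 the alignment factor is 4 (matches the example in 2.2.3):
+print(pad_to_alignment(6, cp_size=2, tp_size=1))  # 8
+print(pad_to_alignment(1, cp_size=2, tp_size=1))  # 4
+```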
+ +#### 2.2.2 Why the Factor of 2? Detailed Explanation of CP Load Balancing + +In Context Parallel (CP) training, the asymmetric nature of causal attention leads to severe load imbalance. + +**Root Cause – Asymmetry in Causal Attention** + +Consider a sequence of length 6: `[0, 1, 2, 3, 4, 5]`, with `CP=2`: + +``` +Full causal attention mask: + 0 1 2 3 4 5 +0 [ 1 0 0 0 0 0 ] +1 [ 1 1 0 0 0 0 ] +2 [ 1 1 1 0 0 0 ] +3 [ 1 1 1 1 0 0 ] +4 [ 1 1 1 1 1 0 ] +5 [ 1 1 1 1 1 1 ] +``` + +**Problem with Naive Partitioning**: + +If the sequence is simply split evenly: +- CP0 handles: `[0, 1, 2]` +- CP1 handles: `[3, 4, 5]` + +The actual computational loads become: +- **CP0**: Only computes attention weights for its own positions (6 weight computations). +- **CP1**: Must compute attention weights from its positions to all preceding positions (15 weight computations). + +**Load ratio: 6:15 = 2:5** — CP1 bears 2.5× more computation than CP0! + +**Solution – 2×CP Interleaved Chunking** + +Megatron-Core resolves this by splitting the sequence into **`2 × CP`** chunks and applying an interleaved assignment strategy: + +``` +Original sequence: [0, 1, 2, 3, 4, 5] +Split into 4 chunks: |[0,1]|[2,3]|[4,5]|[p,p]| (padded to multiple of 4) + +Interleaved assignment: +- Chunk 0 [0,1] → CP0 +- Chunk 1 [2,3] → CP1 +- Chunk 2 [4,5] → CP1 +- Chunk 3 [p,p] → CP0 + +Final assignment: +- CP0: [0,1] + [p,p] +- CP1: [2,3] + [4,5] +``` + +This carefully designed assignment balances the computational load between CP ranks, avoiding performance bottlenecks. + +Thus, **the factor of 2 is essential for CP load balancing**, ensuring roughly equal workloads across CP ranks under causal attention. + +#### 2.2.3 Complete Packing Example + +Assume a micro-batch contains the following samples (original max sequence length = 8): + +| Sample ID | Original Sequence | Valid Length | +|-----------|---------------------------|--------------| +| 0 | `[0, 0, p, p, p, p, p, p]`| 2 | +| 1 | `[1, 1, 1, 1, p, p, p, p]`| 4 | +| 2 | `[2, 2, 2, 2, 2, 2, p, p]`| 6 | +| 3 | `[3, p, p, p, p, p, p, p]`| 1 | + +Configuration: `CP_SIZE=2`, `TP_SIZE=1` + +**Step 1: Remove original padding** +``` +Sample 0: [0, 0] +Sample 1: [1, 1, 1, 1] +Sample 2: [2, 2, 2, 2, 2, 2] +Sample 3: [3] +``` + +**Step 2: Re-pad to alignment boundary** +- Alignment factor = 2 × CP_SIZE × TP_SIZE = 2 × 2 × 1 = 4 + +Re-padded sequences: +``` +Sample 0: [0, 0, p, p] → length 4 +Sample 1: [1, 1, 1, 1] → length 4 +Sample 2: [2, 2, 2, 2, 2, 2, p, p] → length 8 +Sample 3: [3, p, p, p] → length 4 +``` + +**Step 3: Detailed CP Chunking Process** + +With `CP_SIZE=2`, each sequence is logically split into **`2 × CP_SIZE = 4`** segments and assigned via interleaving: + +For any sequence of length L under `CP_SIZE=2`: +- Split into 4 consecutive segments: seg0, seg1, seg2, seg3 +- Each segment has length L/4 +- Assignment rule: + - **CP0**: seg0 + seg3 + - **CP1**: seg1 + seg2 + +Applied to our example: + +- **Sample 0** `[0, 0, p, p]` (length 4): + - seg0: `[0]`, seg1: `[0]`, seg2: `[p]`, seg3: `[p]` + - CP0 gets: seg0 + seg3 = `[0] + [p]` → processes `[0, p]` + - CP1 gets: seg1 + seg2 = `[0] + [p]` → processes `[0, p]` + +- **Sample 1** `[1, 1, 1, 1]` (length 4): + - seg0: `[1]`, seg1: `[1]`, seg2: `[1]`, seg3: `[1]` + - CP0: `[1] + [1]` → `[1, 1]` + - CP1: `[1] + [1]` → `[1, 1]` + +- **Sample 2** `[2, 2, 2, 2, 2, 2, p, p]` (length 8): + - seg0: `[2, 2]`, seg1: `[2, 2]`, seg2: `[2, 2]`, seg3: `[p, p]` + - CP0: `[2, 2] + [p, p]` → `[2, 2, p, p]` + - CP1: `[2, 2] + [2, 2]` → `[2, 2, 2, 2]` + 
+- **Sample 3** `[3, p, p, p]` (length 4): + - seg0: `[3]`, seg1: `[p]`, seg2: `[p]`, seg3: `[p]` + - CP0: `[3] + [p]` → `[3, p]` + - CP1: `[p] + [p]` → `[p, p]` + +**Step 4: Final Packed Input per CP Rank** + +- **CP0’s full input**: `[0, p, 1, 1, 2, 2, p, p, 3, p]` +- **CP1’s full input**: `[0, p, 1, 1, 2, 2, 2, 2, p, p]` + +**Step 5: Cumulative Sequence Lengths** + +Padded cumulative lengths: `[0, 4, 8, 16, 20]` + +### 2.3 Loss Computation Workflow + +Under Sequence Packing, loss calculation requires special handling: + +1. **Unpack Model Outputs**: Use `_unpack_sequences` to restore individual sequences from the packed output. + - Compute start/end positions of each sequence on the current CP rank using `cu_seqlens_padded`. + - `seq_starts = cu_seqlens_padded[:-1] // cp_size` + - `seq_ends = cu_seqlens_padded[1:] // cp_size` + +2. **Per-Sequence Loss Calculation**: + - Apply the loss function to each unpacked sequence individually. + - Adjust original data to match the actual sequence length using `adjust_sequence_length`. + - Accumulate losses from all sequences. + +3. **Result Aggregation**: + - Sum all per-sequence losses to obtain the total loss. + - Aggregate metrics across sequences. + - Apply loss scaling if enabled. + +This per-sequence approach ensures correct loss computation even under complex combinations of CP, TP, and packing. + +### 2.4 Load Balancing Optimization + +To maximize the effectiveness of Sequence Packing, ROLL applies the **Karmarkar-Karp algorithm** at multiple levels for load balancing. + +**Karmarkar-Karp Algorithm Overview**: +A classical multi-way partitioning algorithm that divides a set of numbers into *k* subsets with sums as balanced as possible. In Sequence Packing, it ensures computational loads across processing units remain balanced, preventing bottlenecks. + +Key optimizations include: +- **GLOBAL BATCH → DP RANK Load Balancing**: Ensures each DP rank receives a similar total number of tokens. +- **MINI BATCH → MICRO BATCH Load Balancing**: Balances computational load across micro-batches. + +Implementation details and responsibility allocation are described in Section 3.2. + +## 3. Implementation Workflow + +### 3.1 Core Packing and Unpacking Logic + +Packing logic resides primarily in the strategy layer. When `use_sequence_packing` is enabled, the strategy automatically packs micro-batches and unpacks logits for loss computation. + +**Core packing function `_pack_sequences` performs**: +1. Removes original padding and extracts valid tokens. +2. Computes cumulative sequence lengths (both original and padded). +3. Re-pads sequences to a multiple of `2 * cp_size * tp_size`. +4. Handles CP chunking and assignment. +5. Concatenates sequences and creates `PackedSeqParams`. + +**Loss computation** is handled by `loss_wrapper`, which unpacks outputs and computes per-sequence losses. + +### 3.2 Load Balancing Responsibility Allocation + +Load balancing in ROLL follows a clear division of responsibilities: + +1. **GLOBAL BATCH → DP RANK Load Balancing**: + - **Responsible Module**: Pipeline layer (`batch_balance` function) + - **Objective**: Equalize total token count per DP rank + - **Method**: Apply Karmarkar-Karp algorithm before data distribution + +2. **MINI BATCH → MICRO BATCH Load Balancing**: + - **Responsible Module**: Strategy layer (`make_micro_batch_iter_for_sequence_packing`) + - **Objective**: Balance computational load across micro-batches + - **Method**: Apply Karmarkar-Karp during micro-batch generation + +3. 
**Preservation of Randomness**: + - The division from Batch → Mini Batch retains randomness (for shuffling) and thus does **not** apply load balancing. + +This layered optimization ensures balanced workloads from global to local levels, maximizing hardware utilization. + +## 4. Configuration Parameters + +### 4.1 How to Enable Sequence Packing + +To use Sequence Packing, simply set `use_sequence_packing: true` in your configuration file. + +### 4.2 Parameter Details (Plain Language) + +#### `algorithm` (Packing Algorithm) +- **`none`**: Default simple packing—sequences are packed in their original order. +- **`load_balance`**: Intelligent load-balanced packing—reorders data to balance computational load across micro-batches. **Recommended**. + +#### `max_packed_sequence_length_train` (Max Packed Length for Training) +- Controls the maximum allowed length of a packed sequence during training. +- E.g., setting to 8192 means no packed sequence will exceed 8192 tokens. +- Choose a reasonable value to avoid out-of-memory errors while maintaining packing efficiency. + +#### `max_packed_sequence_length_forward` (Max Packed Length for Inference) +- Same as above, but applied during inference. +- Typically set to the same value as the training parameter. + +#### `min_num_micro_batches_train` (Minimum Micro-Batches for Training) +- Specifies the minimum number of micro-batches per mini-batch during training. +- Setting to 1 means no constraint—the system auto-determines optimal splitting. +- Increase this value if facing GPU memory issues to reduce micro-batch size. + +#### `min_num_micro_batches_forward` (Minimum Micro-Batches for Inference) +- Same as above, but for inference. + +### 4.3 Full Configuration Example + +```yaml +actor_train: + # Enable sequence packing + use_sequence_packing: True + + # Sequence packing configuration + sequence_packing_args: + # Use load-balancing algorithm for better performance + algorithm: load_balance + + # Max packed sequence length during training + max_packed_sequence_length_train: 8192 + + # Max packed sequence length during inference + max_packed_sequence_length_forward: 8192 + + # Minimum 1 micro-batch during training (no constraint) + min_num_micro_batches_train: 1 + + # Minimum 1 micro-batch during inference + min_num_micro_batches_forward: 1 + + # Sequence packing requires megatron strategy + strategy_args: + strategy_name: megatron_train +``` + +### 4.4 Usage Recommendations + +1. **Mandatory Condition**: Only supported under `megatron_train` or `megatron_infer` strategies. +2. **Recommended Setting**: Use `algorithm: load_balance` for optimal performance. +3. **Length Tuning**: Set `max_packed_sequence_length` based on your GPU memory capacity—typically equal to the model’s maximum supported sequence length. +4. **Custom Loss Functions**: If using a custom loss function with sequence packing, refer to the custom loss documentation and ensure `apply_loss_scale` is correctly configured. + +With proper configuration, Sequence Packing significantly boosts training efficiency—especially in RL scenarios with highly variable sequence lengths—while maintaining model performance. 
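+
+To make the packing mechanics of Section 2.2 concrete, the short sketch below re-derives the worked example from Section 2.2.3 in plain Python. It is an illustration only and does not reflect the actual structure of ROLL's `_pack_sequences`.
+
+```python
+from itertools import accumulate
+
+CP_SIZE, TP_SIZE = 2, 1
+ALIGN = 2 * CP_SIZE * TP_SIZE            # alignment factor = 4
+PAD = "p"
+
+
+def pad_to_multiple(n: int, multiple: int) -> int:
+    return (n + multiple - 1) // multiple * multiple
+
+
+def cp_shards(tokens: list, cp_size: int) -> list:
+    """Split one padded sequence into 2*cp_size chunks; CP rank r is assigned
+    chunk r and chunk (2*cp_size - 1 - r), i.e. the interleaved scheme above."""
+    num_chunks = 2 * cp_size
+    chunk_len = len(tokens) // num_chunks
+    chunks = [tokens[i * chunk_len:(i + 1) * chunk_len] for i in range(num_chunks)]
+    return [chunks[r] + chunks[num_chunks - 1 - r] for r in range(cp_size)]
+
+
+# Step 1: original padding already removed (valid lengths 2, 4, 6, 1)
+samples = [[0, 0], [1, 1, 1, 1], [2, 2, 2, 2, 2, 2], [3]]
+
+packed = [[] for _ in range(CP_SIZE)]
+padded_lens = []
+for sample in samples:
+    target = pad_to_multiple(len(sample), ALIGN)             # Step 2: re-pad to 4/4/8/4
+    seq = sample + [PAD] * (target - len(sample))
+    padded_lens.append(target)
+    for rank, shard in enumerate(cp_shards(seq, CP_SIZE)):   # Step 3: CP chunking
+        packed[rank].extend(shard)
+
+cu_seqlens_padded = [0] + list(accumulate(padded_lens))      # Step 5
+print(packed[0])          # [0, 'p', 1, 1, 2, 2, 'p', 'p', 3, 'p']   (CP0, Step 4)
+print(packed[1])          # [0, 'p', 1, 1, 2, 2, 2, 2, 'p', 'p']     (CP1, Step 4)
+print(cu_seqlens_padded)  # [0, 4, 8, 16, 20]
+```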
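+
+Sections 2.4 and 3.2 refer to the Karmarkar-Karp algorithm without showing it. The following is a minimal, self-contained sketch of the multiway largest-differencing heuristic applied to per-sample token counts; ROLL's actual implementation lives in the pipeline and strategy layers and tracks additional metadata.
+
+```python
+import heapq
+
+
+def karmarkar_karp(seq_lens: list, k: int) -> list:
+    """Partition sample indices into k bins whose token totals are near-equal."""
+    heap = []  # entries: (-spread, tie_breaker, bin_sums, bin_members)
+    for idx, n in enumerate(seq_lens):
+        heapq.heappush(heap, (-n, idx, [n] + [0] * (k - 1),
+                              [[idx]] + [[] for _ in range(k - 1)]))
+    tie = len(seq_lens)
+    while len(heap) > 1:
+        _, _, sums_a, mem_a = heapq.heappop(heap)
+        _, _, sums_b, mem_b = heapq.heappop(heap)
+        # Pair the heaviest bin of one partition with the lightest bin of the
+        # other so that the combined partition stays as balanced as possible.
+        order_a = sorted(range(k), key=lambda i: sums_a[i], reverse=True)
+        order_b = sorted(range(k), key=lambda i: sums_b[i])
+        merged = sorted(
+            ((sums_a[ia] + sums_b[ib], mem_a[ia] + mem_b[ib])
+             for ia, ib in zip(order_a, order_b)),
+            key=lambda x: x[0], reverse=True)
+        sums = [s for s, _ in merged]
+        members = [m for _, m in merged]
+        heapq.heappush(heap, (-(sums[0] - sums[-1]), tie, sums, members))
+        tie += 1
+    return heap[0][3]
+
+
+# Balance the four samples from Section 2.2.3 (valid lengths 2, 4, 6, 1) across 2 DP ranks:
+print(karmarkar_karp([2, 4, 6, 1], k=2))  # [[3, 0, 1], [2]] -> token totals 7 and 6
+```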
\ No newline at end of file
diff --git a/docs_roll/docs/User Guides/Configuration/fsdp2.md b/docs_roll/docs/User Guides/Configuration/fsdp2.md
new file mode 100644
index 000000000..128579dcb
--- /dev/null
+++ b/docs_roll/docs/User Guides/Configuration/fsdp2.md
@@ -0,0 +1,246 @@
+# FSDP2 Training and Inference Backend Configuration Guide
+
+[FSDP2 (Fully Sharded Data Parallel 2)](https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html) is PyTorch's latest distributed training framework and provides efficient parameter sharding built on [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html). This document describes how to configure and use the FSDP2 backend in the ROLL framework.
+
+## FSDP2 with ROLL
+
+ROLL supports the following FSDP2 features:
+1. **FSDP2 Sharding**: Shards model parameters, gradients, and optimizer states with FSDP2 [fully_shard](https://docs.pytorch.org/docs/main/distributed.fsdp.fully_shard.html). It also supports checkpoint management with [DCP](https://docs.pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html).
+2. **Context Parallelism**: Supports integration with Context Parallel (Ulysses).
+3. **Model Support**: Supports text models, Vision-Language (VL) models, and MoE (Mixture of Experts) models.
+
+## Configuring FSDP2 Strategy
+
+In the ROLL framework, FSDP2 training and inference strategies can be configured by setting `strategy_args` in the YAML configuration file.
+
+### Training Configuration Example
+
+The following is a typical FSDP2 training configuration example (from `examples_lixing/qwen3-8B-rlvr_fsdp2/rlvr_config.yaml`):
+
+```yaml
+actor_train:
+  model_args:
+    disable_gradient_checkpointing: false
+    dtype: bf16
+    model_type: ~
+  training_args:
+    learning_rate: 1.0e-6
+    weight_decay: 0
+    per_device_train_batch_size: 1
+    gradient_accumulation_steps: 32
+    warmup_steps: 20
+    num_train_epochs: 50
+  strategy_args:
+    strategy_name: fsdp2_train
+    strategy_config:
+      fsdp_size: 16
+      param_dtype: bf16
+      reduce_dtype: float32
+      reshard_after_forward: true
+      offload_policy: false
+  device_mapping: list(range(0,16))
+  infer_batch_size: 4
+```
+
+### Inference Configuration Example
+
+The following is a typical FSDP2 inference configuration example:
+
+```yaml
+reference:
+  model_args:
+    disable_gradient_checkpointing: true
+    dtype: bf16
+    model_type: ~
+  strategy_args:
+    strategy_name: fsdp2_infer
+    strategy_config:
+      fsdp_size: 4
+      param_dtype: bf16
+      reduce_dtype: float32
+      reshard_after_forward: true
+      offload_policy: false
+  device_mapping: list(range(0,8))
+  infer_batch_size: 1
+```
+
+### FSDP2 + Context Parallel Configuration Example
+
+The following is a configuration example combining FSDP2 with Context Parallel (Ulysses) (from `examples_lixing/qwen3-4b-vl_fsdp2_lct/vl_fsdp2_lct_cp2.yaml`):
+
+```yaml
+actor_train:
+  model_args:
+    disable_gradient_checkpointing: false
+    dtype: bf16
+    model_type: ~
+    ulysses_size: 2  # Context parallel size
+  training_args:
+    learning_rate: 1.0e-6
+    weight_decay: 1.0e-2
+    per_device_train_batch_size: 1
+    gradient_accumulation_steps: 256
+    warmup_steps: 0
+    num_train_epochs: 50
+  strategy_args:
+    strategy_name: fsdp2_train
+    strategy_config:
+      fsdp_size: 4  # FSDP sharding size
+      param_dtype: bf16
+      reduce_dtype: float32
+      reshard_after_forward: true
+      offload_policy: false
+  device_mapping: list(range(0,8))
+  infer_batch_size: 1
+```
+
+In this example:
+- Total GPUs: 8
+- Context Parallel (Ulysses) size: 2
+- FSDP size: 4
+- Device mesh shape: (2, 4) [ddp, fsdp]
+- 2 replicas, each with 4-way parameter sharding
+
+### Configuration Parameter Details
+
+1. **strategy_name**:
+   - `fsdp2_train` for training
+   - `fsdp2_infer` for inference
+
+2. **strategy_config**: FSDP2-specific configuration parameters
+   - `fsdp_size`: Number of FSDP shards
+     - If `fsdp_size >= world_size` or `fsdp_size <= 1`: pure FSDP2 mode
+     - If `fsdp_size < world_size`: HSDP mode with DDP replicas
+   - `param_dtype`: Parameter data type (e.g., `bf16`, `fp16`, `float32`)
+   - `reduce_dtype`: Data type for gradient reduction (e.g., `float32`)
+   - `reshard_after_forward`: Whether to reshard parameters after the forward pass
+     - `true`: Reshard after forward
+     - `false`: Keep parameters gathered
+   - `offload_policy`: Whether to enable CPU offloading
+     - `true`: Offload parameters to CPU when not in use (saves GPU memory)
+     - `false`: Keep all parameters on GPU (faster but uses more memory)
+   - `wrap_policy`: Module wrapping policy
+     - `transformer_layer_cls_to_wrap`: List of transformer layer class names to wrap (e.g., `["Qwen3DecoderLayer"]`)
+     - `wrap_embeddings`: Whether to wrap embedding layers (default: `false`)
+     - `wrap_lm_output`: Whether to wrap the LM head (default: `false`)
+     - `moe_experts`: List of MoE expert block class names to wrap (for MoE models, we may want to wrap each expert separately to avoid OOM during parameter gathering, but this requires a dummy expert forward to avoid hangs; see [example](../../../../roll/third_party/fsdp2/qwen3_moe_patch.py))
+
+     If `wrap_policy` is not set, the model's `_no_split_modules` (as defined by transformers) is used by default.
+   - `apply_expert_patch`: Whether to apply the MoE expert patch (for MoE models)
+     - `true`: Apply patch to prevent deadlocks when different ranks activate different experts
+     - `false`: Don't apply patch (may cause deadlocks in MoE models)
+   - `apply_tiled_mlp`: Whether to apply TiledMLP optimization
+     - `true`: Use tiled MLP computation to reduce memory usage
+     - `false`: Use standard MLP computation
+   - `tiled_num_shards`: Number of shards for TiledMLP (default: 4)
+   - `async_save_ckpt`: Whether to save checkpoints asynchronously (default: `true`)
+
+3. **ulysses_size**: Context parallel size (set in `model_args`)
+   - Splits the sequence dimension across multiple GPUs
+   - Compatible with FSDP2 for hybrid parallelism
+   - Useful for long-context training
+
+4. **device_mapping**: Specify the list of GPU device IDs to use
+
+5. **infer_batch_size**: Batch size during inference
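+
+As a point of reference, the minimal sketch below shows roughly how these `strategy_config` fields map onto PyTorch's FSDP2 primitives. It is an illustration only, not ROLL's strategy code: it assumes `torch.distributed` is already initialized on 8 ranks and takes the list of transformer blocks explicitly instead of resolving `transformer_layer_cls_to_wrap` by class name. On PyTorch 2.4/2.5 the same symbols live under `torch.distributed._composable.fsdp`.
+
+```python
+import torch
+from torch import nn
+from torch.distributed.device_mesh import init_device_mesh
+from torch.distributed.fsdp import MixedPrecisionPolicy, fully_shard
+
+
+def apply_fsdp2(model: nn.Module, layers: list) -> nn.Module:
+    # fsdp_size=4 on 8 GPUs -> ddp_size = 8 // 4 = 2; a 2-D (ddp, fsdp) mesh
+    # gives HSDP: replicate across "ddp", shard across "fsdp".
+    mesh = init_device_mesh("cuda", (2, 4), mesh_dim_names=("ddp", "fsdp"))
+    mp_policy = MixedPrecisionPolicy(
+        param_dtype=torch.bfloat16,   # param_dtype: bf16
+        reduce_dtype=torch.float32,   # reduce_dtype: float32
+    )
+    # Shard each transformer block first (cf. transformer_layer_cls_to_wrap),
+    # then the root module. To mirror `offload_policy: true`, additionally pass
+    # offload_policy=CPUOffloadPolicy() (from the same module) to these calls.
+    for layer in layers:
+        fully_shard(layer, mesh=mesh, mp_policy=mp_policy, reshard_after_forward=True)
+    fully_shard(model, mesh=mesh, mp_policy=mp_policy, reshard_after_forward=True)
+    return model
+```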
+
+## Device Mesh Configuration
+
+FSDP2 supports different device mesh configurations based on `fsdp_size` and `ulysses_size`:
+
+### Pure FSDP2 Mode
+
+When `fsdp_size >= world_size` or `fsdp_size <= 1`:
+
+```yaml
+# Example: 16 GPUs, fsdp_size=16
+strategy_config:
+  fsdp_size: 16
+# Device mesh: (16,) [fsdp]
+# All 16 GPUs shard parameters
+```
+
+### HSDP Mode
+
+When `fsdp_size < world_size`:
+
+```yaml
+# Example: 16 GPUs, fsdp_size=8
+strategy_config:
+  fsdp_size: 8
+# ddp_size = 16 // 8 = 2
+# Device mesh: (2, 8) [ddp, fsdp]
+# 2 replicas, each with 8-way parameter sharding
+```
+
+### FSDP2 + Context Parallel (Ulysses)
+
+When both `ulysses_size` and `fsdp_size` are configured:
+
+```yaml
+# Example: 8 GPUs, ulysses_size=2, fsdp_size=4
+model_args:
+  ulysses_size: 2
+strategy_config:
+  fsdp_size: 4
+# ddp_size = 8 // 4 = 2
+# Device mesh: (2, 4) [ddp, fsdp]
+# 2 replicas, each with 4-way parameter sharding
+# Ulysses: 2-way context parallel (sequence dimension split)
+```
+
+## Model-Specific Configurations
+
+### Text Models (Qwen2.5, Qwen3, LLaMA)
+
+```yaml
+strategy_config:
+  fsdp_size: 16
+  param_dtype: bf16
+  reduce_dtype: float32
+  wrap_policy:
+    transformer_layer_cls_to_wrap: ["Qwen3DecoderLayer"]
+```
+
+### Vision-Language Models (Qwen2.5-VL, Qwen3-VL)
+
+VL models require special handling for the vision tower:
+
+```yaml
+actor_train:
+  model_args:
+    freeze_module_prefix: vision_model  # Freeze vision tower
+    ulysses_size: 2  # Optional: context parallel
+  strategy_args:
+    strategy_name: fsdp2_train
+    strategy_config:
+      fsdp_size: 4
+      param_dtype: bf16
+      reduce_dtype: float32
+      # Vision tower blocks automatically have cast_forward_inputs disabled
+```
+
+### MoE Models (Qwen3-MoE)
+
+MoE models require the expert patch to prevent deadlocks:
+
+```yaml
+strategy_config:
+  fsdp_size: 16
+  param_dtype: bf16
+  reduce_dtype: float32
+  apply_expert_patch: true  # Critical for MoE models when wrapping each expert separately
+  wrap_policy:
+    moe_experts: ["Qwen3MoeMLP"]
+```
+
+
+## Notes
+
+1. **PyTorch Version**: FSDP2 requires PyTorch >= 2.4
+2. **MoE Models**: Always enable `apply_expert_patch: true` for MoE models to prevent deadlocks when wrapping experts separately
+3. **VL Models**: Vision tower blocks automatically handle precision issues
+4. **Memory vs Performance**:
+   - `offload_policy: true` saves memory but is slower
+   - `reshard_after_forward: true` saves memory but may be slower
+   - Balance based on your hardware and requirements
\ No newline at end of file
diff --git a/docs_roll/docs/User Guides/Configuration/vllm.md b/docs_roll/docs/User Guides/Configuration/vllm.md
index 7d824ccff..a70773820 100644
--- a/docs_roll/docs/User Guides/Configuration/vllm.md
+++ b/docs_roll/docs/User Guides/Configuration/vllm.md
@@ -74,20 +74,6 @@ In the configuration example, we can see:
 
 This design allows different components to choose the most suitable inference engine according to their needs.
 
-### beam_search Configuration
-RLVRPipeline supports vllm beam_search generation method, configured as follows:
-```yaml
-generate_opt_level: 0 # Degrades to batch_generate generation method, generate_opt_level=1 is prompt-level parallel method
-num_return_sequences_in_group: 8
-actor_infer:
-  generating_args:
-    num_beams: ${num_return_sequences_in_group}
-    num_return_sequences: ${num_return_sequences_in_group}
-```
-Note:
-- generating_args.num_beams and generating_args.num_return_sequences must be set to the same value.
-- The generating_args configuration in validate is also configured in the same way.
-
 ## Performance Optimization Recommendations
 
 1. **Memory Management**:
diff --git a/docs_roll/docs/User Guides/Pipeline/agent_pipeline_start.md b/docs_roll/docs/User Guides/Pipeline/agent_pipeline_start.md
index 56be4523d..7c0eeeb04 100644
--- a/docs_roll/docs/User Guides/Pipeline/agent_pipeline_start.md
+++ b/docs_roll/docs/User Guides/Pipeline/agent_pipeline_start.md
@@ -21,6 +21,9 @@ The ROLL (Reinforcement Learning Optimization for Large-Scale Learning) agentic
 * Train these agents using reinforcement learning algorithms like Proximal Policy Optimization (PPO), GRPO, and **reinforce++**.
 * Evaluate agent performance on specific tasks and complex reasoning scenarios.
 * Leverage [Ray](https://www.ray.io/) for efficient, distributed computation across large-scale GPU setups.
+* **Efficient Training Optimization**: Supports **Sequence Packing** (concatenating multiple short samples into a continuous sequence to reduce padding) and **Dynamic Batching**
+(dynamically grouping samples into batches based on their lengths, applying uniform padding within each batch to the length of the longest sample, thereby minimizing unnecessary computation).
+For configuration methods and implementation details, please refer to the dedicated documentation for `sequence packing` and `dynamic batching`.
 
 This guide provides a step-by-step walkthrough for utilizing these agentic capabilities.
 
diff --git a/docs_roll/docs/User Guides/Pipeline/agentic_pipeline_start.md b/docs_roll/docs/User Guides/Pipeline/agentic_pipeline_start.md
index d0d859340..a0b587392 100644
--- a/docs_roll/docs/User Guides/Pipeline/agentic_pipeline_start.md
+++ b/docs_roll/docs/User Guides/Pipeline/agentic_pipeline_start.md
@@ -33,7 +33,9 @@ Agentic Pipeline is ROLL's core pipeline for agent training, supporting multiple
 * **Asynchronous Training**: Decoupling of rollout/training supports asynchronous training.
 * **Multi-turn Interaction Support for Local Debugging**: Multi-turn interaction rollout supports local debugging, improving development efficiency for multi-turn interaction business.
 * **Flexible Policy Configuration**: Supports multiple distributed training strategies such as Megatron, DeepSpeed, vLLM, etc., allowing flexible configuration based on hardware resources.
-
+* **Efficient Training Optimization**: Supports **Sequence Packing** (concatenating multiple short samples into a continuous sequence to reduce padding) and **Dynamic Batching**
+(dynamically grouping samples into batches based on their lengths, applying uniform padding within each batch to the length of the longest sample, thereby minimizing unnecessary computation).
+For configuration methods and implementation details, please refer to the dedicated documentation for `sequence packing` and `dynamic batching`.
 ---
 
 ## ✨️ Core Components
diff --git a/docs_roll/docs/User Guides/Pipeline/distill_pipeline_start.md b/docs_roll/docs/User Guides/Pipeline/distill_pipeline_start.md
index 0cbb123da..8c9d563e3 100644
--- a/docs_roll/docs/User Guides/Pipeline/distill_pipeline_start.md
+++ b/docs_roll/docs/User Guides/Pipeline/distill_pipeline_start.md
@@ -35,6 +35,7 @@
 
 * **Efficient Distributed Computing**: Leverages the [Ray](https://www.ray.io/) framework to implement efficient distributed training on large-scale GPU clusters, significantly improving training speed and resource utilization.
+* **Efficient Training Optimization**: Supports **Sequence Packing** (concatenating multiple short samples into a continuous sequence to reduce padding). For configuration methods and implementation details, please refer to the dedicated documentation for `sequence packing`. --- diff --git a/docs_roll/docs/User Guides/Pipeline/rlvr_pipeline_start.md b/docs_roll/docs/User Guides/Pipeline/rlvr_pipeline_start.md index ff48b3015..dbef7f187 100644 --- a/docs_roll/docs/User Guides/Pipeline/rlvr_pipeline_start.md +++ b/docs_roll/docs/User Guides/Pipeline/rlvr_pipeline_start.md @@ -41,6 +41,9 @@ * **Efficient Distributed Computing**: Leverages the [Ray](https://www.ray.io/) framework to implement efficient distributed training on large-scale GPU clusters, significantly improving training speed and resource utilization. +* **Efficient Training Optimization**: Supports **Sequence Packing** (concatenating multiple short samples into a continuous sequence to reduce padding) and **Dynamic Batching** +(dynamically grouping samples into batches based on their lengths, applying uniform padding within each batch to the length of the longest sample, thereby minimizing unnecessary computation). +For configuration methods and implementation details, please refer to the dedicated documentation for `sequence packing` and `dynamic batching`. --- diff --git a/docs_roll/docs/User Guides/Pipeline/sft_pipeline_start.md b/docs_roll/docs/User Guides/Pipeline/sft_pipeline_start.md new file mode 100644 index 000000000..36f32b36b --- /dev/null +++ b/docs_roll/docs/User Guides/Pipeline/sft_pipeline_start.md @@ -0,0 +1,272 @@ +# SFT Pipeline + +**Table of Contents** + +- [SFT Pipeline](#sft-pipeline) + - [✨️ Overview](#️-overview) + - [✨️ Core Components](#️-core-components) + - [Main Module (`SFTPipeline`)](#main-module-sftpipeline) + - [Worker (`SFTWorker`)](#worker-sftworker) + - [Configuration (`SFTConfig`)](#configuration-sftconfig) + - [Config Structure and Organization](#config-structure-and-organization) + - [✨️ Data Preparation](#️-data-preparation) + - [Data Format](#data-format) + - [Required Fields and Field Mapping](#required-fields-and-field-mapping) + - [Chat Template and Labels Rules](#chat-template-and-labels-rules) + - [Validation Set (`validation`)](#validation-set-validation) + - [✨️ Running the Pipeline](#️-running-the-pipeline) + - [Method 1: Start with a Python Script](#method-1-start-with-a-python-script) + - [Method 2: Use a Helper Shell Script](#method-2-use-a-helper-shell-script) + - [✨️ Step-by-step Example](#️-step-by-step-example) + - [Step 1: Configuration](#step-1-configuration) + - [Step 2: Prepare Environment and Dependencies](#step-2-prepare-environment-and-dependencies) + - [Step 3: Launch the Pipeline](#step-3-launch-the-pipeline) + - [Step 4: Monitoring](#step-4-monitoring) + - [Step 5: Outputs and Results](#step-5-outputs-and-results) + +--- + +## ✨️ Overview + +This pipeline is designed for Supervised Fine-Tuning (SFT) and provides: + +- **Unified data encoding and chat templates**: Supports concatenating system/user/assistant chat formats and automatically constructs `labels` (loss is computed only on the answer portion). +- **Efficient distributed training**: Uses [Ray](https://www.ray.io/) plus a Cluster/Worker abstraction to launch distributed training. +- **Comprehensive performance monitoring**: A fine-grained metrics tracking system that monitors performance indicators and provides full visualization and analysis of the training process. 
+- **Efficient Training Optimization**: Supports **Sequence Packing** (concatenating multiple short samples into a continuous sequence to reduce padding). For configuration methods and implementation details, please refer to the dedicated documentation for `sequence packing`. +--- + +## ✨️ Core Components + +### Main Module (`SFTPipeline`) + +`SFTPipeline` (located at `roll/pipeline/sft/sft_pipeline.py`) is the main SFT training flow and is responsible for: + +- Loading the tokenizer. +- Loading the training dataset and the (optional) validation dataset. +- Encoding data with templates to generate `input_ids` / `attention_mask` / `labels`. +- Initializing the distributed training cluster (`Cluster` + `SFTWorker`). +- Training loop: trains by step, evaluates every `eval_steps`, saves checkpoints according to the save policy, records metrics, and reports them to the tracker. + +--- + +### Worker (`SFTWorker`) + +`SFTWorker` (located at `roll/pipeline/sft/sft_worker.py`) executes training, evaluation, and checkpoint saving: + +- `initialize()`: Creates and initializes the distributed strategy (`create_strategy`) and loads the model. +- `train_step()`: Runs one training step and returns training metrics. +- `val_step()`: Runs one validation step (forward + loss) and returns validation metrics. +- `do_checkpoint()`: Saves a checkpoint and returns metrics such as save time. + +--- + +### Configuration (`SFTConfig`) + +`SFTConfig` (defined in `roll/pipeline/sft/sft_config.py`) is the configuration object (dataclass-style) for the SFT pipeline, and supports YAML + Hydra management. + +#### Config Structure and Organization + +Example config file: `examples/qwen2.5-7B-sft_megatron/sft_config.yaml` + +A typical config includes: + +1. **Experiment basics** + - `exp_name`: experiment name + - `seed`: random seed + - `logging_dir`: log directory + - `output_dir`: checkpoint/output directory + +2. **Training control parameters** + - `save_steps`: checkpoint saving frequency + - `logging_steps`: training metrics logging frequency + - `eval_steps`: evaluation frequency (effective when a validation set is enabled) + - `resume_from_checkpoint`: settings for resuming from a checkpoint + +3. **Model configuration** + - `pretrain`: path to the pretrained model + +4. **Data field mapping (critical)** + - `system_key`: system prompt field (optional) + - `prompt_key`: prompt field name (default: `instruction`) + - `query_key`: query field name (optional) + - `response_key`: response field name (default: `output`) + - `global_template`: global template name (optional; otherwise use `sft_train.data_args.template`) + +5. **Worker configuration (`sft_train`)** + `sft_train` is a `WorkerConfig` and includes: + + - **Data args** (`data_args`) + - `file_name`: training data JSON path (string or list) + - `template`: template name (used when `global_template` is not set) + - `preprocessing_num_workers`: number of preprocessing workers + - **Training args** (`training_args`) + - `num_train_epochs` + - `learning_rate` + - `per_device_train_batch_size` + - `gradient_accumulation_steps` + - `dataloader_num_workers` + - ... + - **Strategy args** (`strategy_args`) + - `strategy_name`: e.g., `megatron_train` / `deepspeed_train`, etc. + - Parallelism-related parameters (tensor/pipeline parallel sizes, etc.) + - **Device mapping** (`device_mapping`) + - Specifies which GPUs the worker uses + - **Inference batch** (used in validation) + - `infer_batch_size`: used during validation + +6. 
**Validation configuration (optional)** + - `validation.data_args.file_name`: validation data JSON path (validation is enabled only if set) + +--- + +## ✨️ Data Preparation + +### Data Format + +The SFT pipeline uses **JSON** files loaded via HuggingFace Datasets. + +#### Required Fields and Field Mapping + +Each sample must be mappable to at least: + +- Prompt: specified by `prompt_key` (default: `instruction`) +- Response: specified by `response_key` (default: `output`) + +Optional fields: + +- `system_key`: system prompt (optional) +- `query_key`: additional input (optional; appended to the user content) + +#### Chat Template and Labels Rules + +Chat structure: + +- system (optional) +- user (prompt + query) +- assistant (response) + +Labels construction: + +- All tokens in the prompt portion are set to `IGNORE_INDEX` (not included in loss). +- Tokens in the response portion use real token ids (included in loss). + +In other words: supervision is applied only to the model’s “answer portion”. + +--- + +### Validation Set (`validation`) + +The validation set is optional: + +- It is loaded only if `validation.data_args.file_name` is configured. +- During training, validation is triggered according to `eval_steps`. +- Validation is executed by `sft_train.val_step` (no separate validation worker is launched). + +--- + +## ✨️ Running the Pipeline + +### Method 1: Start with a Python Script + +Start with `examples/start_sft_pipeline.py`; Hydra loads the configuration: + +```bash +# Make sure you are in the ROLL project root directory +# export PYTHONPATH=$(pwd):$PYTHONPATH + +python examples/start_sft_pipeline.py \ + --config_path examples/qwen2.5-7B-sft_megatron \ + --config_name sft_config +``` + +- `--config_path` – config directory: `examples/qwen2.5-7B-sft_megatron` +- `--config_name` – config file name: `sft_config` (corresponds to `sft_config.yaml`) + +--- + +### Method 2: Use a Helper Shell Script + +Example: + +```bash +#!/bin/bash +# Example: examples/qwen2.5-7B-sft_megatron/run_sft_pipeline.sh + +CONFIG_NAME="sft_config" +CONFIG_PATH="examples/qwen2.5-7B-sft_megatron" + +python examples/start_sft_pipeline.py \ + --config_path $CONFIG_PATH \ + --config_name $CONFIG_NAME \ + "$@" +``` + +Run: + +```bash +bash examples/qwen2.5-7B-sft_megatron/run_sft_pipeline.sh +``` + +--- + +## ✨️ Step-by-step Example + +### Step 1: Configuration + +Config file: `examples/qwen2.5-7B-sft_megatron/sft_config.yaml` + +Key items to check: + +- **Data config**: `sft_train.data_args.file_name` +- **Field mapping**: `prompt_key/query_key/response_key/system_key` +- **Model config**: `pretrain` +- **Distributed strategy**: `sft_train.strategy_args` and `sft_train.device_mapping` +- **Validation config (optional)**: `validation.data_args.file_name` and `eval_steps` +- **Template selection**: `global_template` or `sft_train.data_args.template` + +### Step 2: Prepare Environment and Dependencies + +```bash +pip install -r requirements.txt +``` + +Also ensure: + +- The `pretrain` path is accessible +- The fields in training/validation JSON match `prompt_key/response_key/...` + +### Step 3: Launch the Pipeline + +```bash +python examples/start_sft_pipeline.py \ + --config_path examples/qwen2.5-7B-sft_megatron \ + --config_name sft_config +``` + +### Step 4: Monitoring + +- **Console output** – watch Hydra, Ray, and pipeline logs +- **Log files** – check `logging_dir` +- **TensorBoard** + ```bash + tensorboard --logdir + ``` + +### Step 5: Outputs and Results + +- **Trained model** – checkpoints are saved under 
`output_dir` with the default structure: + + ``` + /sft_train/checkpoint-// + ``` + + Where: + - ``: current training step (e.g., `checkpoint-200`) + - ``: distributed cluster name (determined by Cluster/Ray runtime) + +- **Training/validation metrics** – recorded in the terminal and tracker/TensorBoard (depending on tracker configuration) + +--- + +*Happy experimenting!* \ No newline at end of file diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/custom_loss_func_cn.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/custom_loss_func_cn.md new file mode 100644 index 000000000..46b2c89e4 --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/custom_loss_func_cn.md @@ -0,0 +1,362 @@ +# 自定义 `loss_func` 实现指南 + +在 ROLL 中实现自定义 `loss_func` 时,最关键的是 **loss 的聚合方式(aggregate)** 与 **`loss_scale` 的处理**。如果这两点处理不当,会导致最终计算出的 loss 或梯度 **不等价于对整个 global batch 一次性前向计算的结果**,从而引入训练偏差——这在 **数据并行(DP) + 梯度累积(Gradient Accumulation, GA) + 序列打包(Sequence Packing)** 的复杂训练场景下尤为严重。 + +--- + +## 1. 常用 Loss 聚合方式 + +设一个 **global batch** 包含 $B$ 个序列。第 $i$ 个序列长度为 $T_i$,其 token 级 mask 为 $m_{i,t} \in \{0,1\}$,表示该位置是否参与 loss 计算。有效 token 数为: + +$$ +N_i = \sum_{t=1}^{T_i} m_{i,t}, \quad N_{\text{all}} = \sum_{i=1}^{B} N_i +$$ + +令 $\mathcal{L}_{i,t}$ 表示第 $i$ 个序列第 $t$ 个位置的逐 token loss(如 NLL、CE、KL 散度、策略损失等)。 + +### 1.1 Token-level Loss(token-mean) + +对 global batch 中 **所有有效 token 求平均**: + +$$ +\mathcal{L}_{\text{token}} = \frac{1}{N_{\text{all}}} \sum_{i=1}^{B} \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} +$$ + +**特点**:每个 token 权重相同,长序列因包含更多有效 token 而贡献更大。 + +### 1.2 Sequence-level Loss(seq-mean) + +先对每条序列内部做聚合,再对所有序列求平均。ROLL 中常用两种变体: + +**(a) seq-mean-token-sum** +序列内对 token 求和,再对序列求平均: +$$ +\mathcal{L}_{\text{seq-sum}} = \frac{1}{B} \sum_{i=1}^{B} \left( \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +**(b) seq-mean-token-mean** +序列内对 token 求平均,再对序列求平均: +$$ +\mathcal{L}_{\text{seq-mean}} = \frac{1}{B} \sum_{i=1}^{B} \left( \frac{1}{N_i} \sum_{t=1}^{T_i} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +**特点**:每条序列权重相同,不会因长度不同而产生偏差。 + +--- + +## 2. 分布式训练中的 micro-batch 划分 + +实际训练中,一个 global step 通常同时涉及: + +- **数据并行(DP)**:global batch 被划分到多个 DP rank 上; +- **梯度累积(GA)**:每个 rank 将其数据进一步划分为多个 micro-batch,逐次前向/反向; +- **序列打包(Sequence Packing)**:为减少 padding、提升 GPU 利用率,将多个样本拼接成固定长度的 packed 序列。 + +设: +- DP world size 为 $D$, +- Gradient accumulation steps 为 $A$, +- 则一个 global step 内共有 $M = D \times A$ 个 micro-batch。 + +第 $k$ 个 micro-batch 包含的样本集合记为 $\mathcal{S}_k$,其有效 token 数为: +$$ +N_k = \sum_{(i,t) \in \mathcal{S}_k} m_{i,t}, \quad N_{\text{all}} = \sum_{k=1}^{M} N_k +$$ +其包含的序列数量(即样本数)为 $B_k$,满足: +$$ +B = \sum_{k=1}^{M} B_k +$$ + +### 2.1 为什么 sequence packing 会导致 $B_k$ 不固定? + +开启 sequence packing 后,框架通常按 **token 预算**(而非固定样本数)来构建 micro-batch: + +- 短序列可被密集打包 → 某些 micro-batch 包含较多样本($B_k$ 较大); +- 长序列占用更多空间 → 某些 micro-batch 只能容纳较少样本($B_k$ 较小)。 + +因此,在 packing 场景下,各 micro-batch 的样本数 $B_k$ 通常是**不均衡且不可预测的**。这对 sequence-level loss 的正确聚合提出了挑战。 + +--- + +## 3. 核心问题:为何不能在 micro-batch 内使用局部统计量做归一化? 
+ +ROLL 的目标是:**无论训练配置如何(DP/GA/Packing),最终用于反向传播的 loss 必须严格等价于对整个 global batch 一次性计算的结果**(见第 1 节)。 + +若在每个 micro-batch 内使用其自身的统计量(如 $N_k$ 或 $B_k$)进行归一化,再依赖 backend 进行梯度累积,通常会导致**非等价结果**。 + +### 3.1 Token-level:错误的 micro 内归一化 + +**错误做法**(用 micro 自身 token 数归一化): +$$ +\ell_k^{\text{wrong}} = \frac{1}{N_k} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +若 micro-batch 之间被等权平均(如通过梯度平均实现),则总 loss 为: +$$ +\frac{1}{M} \sum_{k=1}^{M} \ell_k^{\text{wrong}} = \frac{1}{M} \sum_{k=1}^{M} \left( \frac{1}{N_k} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} \right) +$$ + +而正确的 global token-mean 应为: +$$ +\mathcal{L}_{\text{token}} = \frac{1}{N_{\text{all}}} \sum_{k=1}^{M} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +二者仅在所有 $N_k$ 相等时才一致。在变长序列或 packing 场景下,$N_k$ 差异显著,导致偏差。 + +### 3.2 Sequence-level:micro 内 seq-mean 导致样本权重失衡 + +以 `seq-mean-token-mean` 为例: + +**错误做法**(用 micro 自身样本数 $B_k$ 归一化): +$$ +\ell_k^{\text{wrong}} = \frac{1}{B_k} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i, \quad \text{其中 } \bar{\mathcal{L}}_i = \frac{1}{N_i} \sum_t m_{i,t} \mathcal{L}_{i,t} +$$ + +micro 间等权平均后得到: +$$ +\frac{1}{M} \sum_{k=1}^{M} \ell_k^{\text{wrong}} = \frac{1}{M} \sum_{k=1}^{M} \left( \frac{1}{B_k} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i \right) +$$ + +而正确的 global seq-mean 是: +$$ +\mathcal{L}_{\text{seq-mean}} = \frac{1}{B} \sum_{i=1}^{B} \bar{\mathcal{L}}_i +$$ + +前者等价于“每个 micro-batch 等权”,后者是“每个序列等权”。当 $B_k$ 不固定时(packing 常见),两者不等价。 + +--- + +## 4. 正确做法:使用全局分母 + micro 间求和 + +ROLL 的设计原则是: + +1. **在 micro-batch 内部聚合时,直接使用 global 统计量作为分母**; +2. **每个 micro-batch 返回的 loss 应设计为 global loss 的一部分**; +3. **所有 micro-batch 的 loss 相加后,应精确等于 global loss**; +4. **通过 `loss_scale` 抵消 backend 的默认归一化行为**(见第 5 节)。 + +### 4.1 Token-level 的正确实现 + +对第 $k$ 个 micro-batch: +$$ +\ell_k = \frac{1}{N_{\text{all}}} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} +$$ + +则: +$$ +\sum_{k=1}^{M} \ell_k = \frac{1}{N_{\text{all}}} \sum_{k=1}^{M} \sum_{(i,t) \in \mathcal{S}_k} m_{i,t} \mathcal{L}_{i,t} = \mathcal{L}_{\text{token}} +$$ + +✅ 严格等价。 + +### 4.2 Sequence-level 的正确实现(以 seq-mean-token-mean 为例) + +对第 $k$ 个 micro-batch: +$$ +\ell_k = \frac{1}{B} \sum_{i \in \mathcal{S}_k} \bar{\mathcal{L}}_i +$$ + +则: +$$ +\sum_{k=1}^{M} \ell_k = \frac{1}{B} \sum_{i=1}^{B} \bar{\mathcal{L}}_i = \mathcal{L}_{\text{seq-mean}} +$$ + +✅ 即使 $B_k$ 不固定(packing 场景),仍严格成立。 + +--- + +## 5. `loss_scale`:抵消 backend 的默认归一化 + +大多数训练框架(如 Megatron、FSDP)为保证梯度尺度稳定,在 DP + GA 下会对梯度做隐式归一化: + +- **GA 维度**:对 $A$ 次 micro-step 的梯度取平均(等效于 `loss /= A`); +- **DP 维度**:AllReduce 后除以 $D$(等效于跨 rank 求平均)。 + +综合效果等价于: +$$ +g \propto \frac{1}{M} \sum_{k=1}^{M} \nabla \ell_k, \quad M = D \times A +$$ + +但 ROLL 的 aggregate 设计要求 **micro 间是求和语义**: +$$ +\nabla \mathcal{L}_{\text{global}} = \sum_{k=1}^{M} \nabla \ell_k +$$ + +为抵消 backend 的 $1/M$ 归一化,需在每个 micro-batch 的 loss 上乘以: +$$ +\text{loss\_scale} = M +$$ + +这样: +$$ +\frac{1}{M} \sum_{k=1}^{M} \nabla (M \cdot \ell_k) = \sum_{k=1}^{M} \nabla \ell_k +$$ + +✅ 恢复了正确的求和语义。 + +--- + +## 6. 
ROLL 接口:全局统计量注入机制与 `loss_scale` 控制 + +在 ROLL 中,为了支持在 micro-batch 级别实现**全局等价的 loss 聚合**,框架会自动为每个训练 step 注入当前 global batch 的全局统计信息(如总有效 token 数、总有效样本数)。这些信息的**计算方式完全由用户通过 `loss_mask_keys` 指定**。 + +### 6.1 `loss_mask_keys`:定义 loss 参与范围,并驱动全局统计注入 + +`loss_mask_keys` 是一个字符串列表,用于声明 **哪些 mask 字段应被用于识别“参与 loss 计算的有效 token”**。该配置不仅指导 loss 函数如何屏蔽无效位置,更重要的是——**它直接决定了 strategy 如何统计并注入全局聚合量**。 + +你需要在 pipeline 的数据预处理或 worker 初始化阶段设置: +```python +data.meta_info['loss_mask_keys'] = ['response_mask', 'labels_mask'] +``` + +对于 `loss_mask_keys` 中的每一个 key(例如 `'response_mask'`),ROLL 的 strategy 会: + +1. **从 `data.batch` 中提取对应的 mask 张量**(形状通常为 `[batch_size, seq_len]`); +2. **跨所有 DP rank 和 GA steps 收集该 mask**; +3. **计算两个全局统计量**: + - **`batch_num_tokens[key]`**:该 mask 在整个 global batch 中的 **总和**,即 + $$ + N_{\text{all}}^{(\text{key})} = \sum_{\text{all samples}} \sum_{t} \text{mask}_{i,t}^{(\text{key})} + $$ + - **`global_valid_samples[key]`**:该 mask **至少有一个有效 token 的序列数量**,即 + $$ + B^{(\text{key})} = \sum_{i=1}^{B} \mathbb{I}\left( \sum_{t} \text{mask}_{i,t}^{(\text{key})} > 0 \right) + $$ + +这些统计量会被注入到 `data.meta_info` 中,供 `loss_func` 使用。 + +> ⚠️ **关键一致性要求**:你在 `loss_func` 中用于计算 loss、加权或聚合的 mask,**必须与 `loss_mask_keys` 中指定的 key 对应的 mask 语义完全一致**。 +> 例如,若 `loss_mask_keys = ['response_mask']`,则你的 loss 必须且只能基于 `response_mask` 来屏蔽 token;若实际使用了其他 mask(如 `attention_mask`),会导致分子(loss 计算)与分母(全局统计)不匹配,破坏等价性。 + +### 6.2 在 `loss_func` 中使用注入的全局统计量 + +在自定义 `loss_func` 中,你可以通过以下方式获取对应 mask 的全局统计量: + +```python +# 假设 loss_mask_keys 包含 'response_mask' +mask_key = 'response_mask' + +N_all = data.meta_info['batch_num_tokens'][mask_key] # 全局有效 token 数 +B_all = data.meta_info['global_valid_samples'][mask_key] # 全局有效样本数 +``` + +然后在聚合时直接使用这些全局值作为分母(见第 4 节),确保 micro-batch 的局部计算能精确还原 global loss。 + +### 6.3 `apply_loss_scale`:控制是否应用梯度尺度校正 + +由于训练 backend(如 Megatron/FSDP)在 DP + GA 下通常会对梯度做 $1/(D \times A)$ 的隐式归一化,而 ROLL 的聚合设计依赖**求和语义**,因此需要通过 `loss_scale = D \times A` 进行补偿。 + +在 `worker_config` 中,参数 `apply_loss_scale` 控制是否自动应用此缩放: + +- **默认值:`True`**(推荐保持开启) +- **作用**:框架会自动将 `loss_func` 返回的 loss 乘以 `loss_scale` +- **何时关闭**:仅当你在 `loss_func` 内部已手动完成完整 global loss(含 scale)时才设为 `False`,一般不建议。 + +--- + +## 7. Metrics 记录:使用 `@sum` 语义 + +对于通过全局分母聚合的 loss,其 metrics 在多 worker reduce 时**不应取平均**,而应**求和**。 + +ROLL 支持在 metric 名称后添加 `@操作符` 来指定 reduce 方式: + +```python +metrics = { + "actor/kl_loss@sum": kl_loss.detach().item(), +} +reduce_metrics(metrics) +``` + +- `@sum`:reduce 时对所有 worker 的值求和; +- `@mean`(默认):求平均; +- 日志记录时会自动过滤 `@` 及之后的内容,最终显示为 `actor/kl_loss`。 + +--- + +## 8. 
代码示例:Actor 中 KL Loss 的全局等价实现 + +### 8.1 计算逐 token KL + +```python +kl_loss = compute_approx_kl( + log_probs=log_probs, + log_probs_base=ref_log_probs, + action_mask=final_response_mask, + kl_penalty="k3" +) +``` + +### 8.2 调用聚合函数(使用全局分母) + +```python +kl_loss = agg_loss( + loss_mat=kl_loss, + loss_mask=final_response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'], +) +``` + +### 8.3 `agg_loss` 关键实现 + +```python +def agg_loss(loss_mat, loss_mask, loss_agg_mode, batch_num_tokens=None, global_valid_samples=None, weights=None): + if batch_num_tokens is None: + batch_num_tokens = loss_mask.sum() + if global_valid_samples is None: + global_valid_samples = loss_mat.size(0) + + if loss_agg_mode == "token-mean": + loss = (loss_mat * loss_mask).sum() / batch_num_tokens + elif loss_agg_mode == "seq-mean-token-sum": + seq_losses = (loss_mat * loss_mask).sum(dim=-1) + valid = (loss_mask.sum(dim=-1) > 0).float() + loss = (seq_losses * valid).sum() / (global_valid_samples + 1e-8) + elif loss_agg_mode == "seq-mean-token-mean": + seq_means = masked_mean(loss_mat, loss_mask, dim=-1) # 自定义函数,支持 mask + valid = (loss_mask.sum(dim=-1) > 0).float() + loss = (seq_means * valid).sum() / (global_valid_samples + 1e-8) + else: + raise ValueError(f"Unsupported loss_agg_mode: {loss_agg_mode}") + + return loss +``` + +### 8.4 记录指标 + +```python +pg_metrics = {"actor/kl_loss@sum": kl_loss.detach().item()} +``` + +--- + +## 9. 设计建议:自定义 loss 实现 Checklist(⚠️ 所有注意事项汇总) + +为确保 loss 在任意训练配置下保持数学等价性和训练稳定性,请严格遵循以下 checklist: + +### ✅ **Loss 粒度与聚合模式** +- 明确你的 loss 是 **token-level** 还是 **sequence-level**。 +- 根据需求选择正确的 `loss_agg_mode`(如 `"token-mean"`、`"seq-mean-token-mean"`)。 + +### ✅ **全局分母使用(核心!)** +- **禁止**在 micro-batch 内使用局部统计量(如 `loss_mask.sum()` 或 `loss_mat.shape[0]`)作为分母。 +- **必须**使用 `data.meta_info['batch_num_tokens'][key]` 和 `data.meta_info['global_valid_samples'][key]` 提供的**全局统计量**。 + +### ✅ **`loss_mask_keys` 配置与一致性(极易出错!)** +- 在 pipeline 中显式设置 `data.meta_info['loss_mask_keys']`。 +- **确保** `loss_func` 中用于计算/屏蔽/加权的 mask **与 `loss_mask_keys` 中指定的 key 完全对应**。 +- 若使用多个 mask(如 response + labels),需全部列入 `loss_mask_keys`,并分别处理。 + +### ✅ **`apply_loss_scale` 设置** +- **保持默认 `True`**,除非你完全理解并接管了 scale 逻辑。 +- 错误关闭会导致梯度被 backend 隐式缩小 $1/(D \times A)$ 倍,训练发散或收敛极慢。 + +### ✅ **Metrics 记录方式** +- 对使用全局分母聚合的 loss,**必须**在 metric 名称后加 `@sum`(如 `"loss@sum"`)。 +- 否则 reduce 时取平均会导致 logged loss 值错误(偏小 $M$ 倍)。 + +### ✅ **Packing 场景特别注意** +- 不要假设 micro-batch 的样本数 $B_k$ 或 token 数 $N_k$ 固定。 +- 所有聚合逻辑必须**不依赖 micro 内部统计量**,只依赖全局注入值。 + +--- \ No newline at end of file diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/llm_as_judge_optimization.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/llm_as_judge_optimization.md new file mode 100644 index 000000000..553111ff2 --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/llm_as_judge_optimization.md @@ -0,0 +1,262 @@ +# LLM as Judge 在 Agentic 环境中的优化实现 + +本文档介绍 ROLL 框架中 LLM as Judge 在 Agentic 环境中的优化实现方案,包括系统架构、调用链路、配置方法和最佳实践。 + +## 概览 + +LLM as Judge 是一种使用大语言模型作为评判器来评估智能体响应质量的方法。在 Agentic 训练场景中,大规模环境实例并发执行 rollout 时,使用 LLM as Judge 计算 reward 会产生大量并发 LLM 请求,这对外部 LLM 服务的稳定性和吞吐量提出了巨大挑战。 + +为解决这一问题,ROLL 框架通过**独立的 Reward Cluster** 和**高效的调度机制**,实现了可扩展的本地化并行评估系统,避免了对外部服务的依赖,确保了训练过程的稳定性和可控性。 + +:::info 文档说明 +本文档以 **DeepEyes 
环境**的 LLM as Judge 实现为例进行说明。对于其他需要使用 LLM as Judge 的环境,可以参考 `env_manager` 和 `env` 内的调用方式自定义实现。 +::: + +### 核心优势 + +- **独立资源管理**:Reward 模型与 Policy 模型分离,可独立分配 GPU 资源,避免资源竞争 +- **本地化部署**:通过本地 Reward Cluster 避免外部 API 依赖,保证服务稳定性和数据安全 +- **高并发支持**:通过 RequestScheduler 实现多环境并行的高效 reward 评估,支持环境并发扩展 +- **统一接口设计**:提供 `generate_by_proxy` 统一工具函数,简化 LLM 调用逻辑,支持文本和多模态 +- **灵活配置**:支持多种推理后端(vLLM、SGLang)和自定义生成参数 + +### 应用场景 + +典型的 Agentic 训练场景: +- **环境规模**:256个环境组,每组 4 个环境,共 1024个并发环境实例 +- **Rollout 频率**:每个环境完成 episode 后调用 LLM Judge +- **并发压力**:在 rollout 高峰期可能有 500+ 个环境同时请求 reward 评估 +- **稳定性要求**:训练过程不能因为外部 API 限流或超时而中断 + +通过本文档介绍的优化实现,可以有效应对上述挑战。 + +## 系统架构 + +### 整体架构 + +``` +AgenticPipeline + ├── Reward Cluster (可选,独立GPU资源) + │ ├── InferWorker (默认) + │ └── 支持 vLLM/SGLang 后端 + │ + ├── Reward Scheduler (Ray Named Actor) + │ ├── 请求路由与负载均衡 + │ ├── 并发控制 + │ └── 请求追踪与清理 + │ + └── Environment Manager + ├── llm_proxy: 用于 policy 推理 + ├── reward_proxy: 用于 LLM as Judge + └── env实例 + └── 在 obtain_outcome_reward 中调用 reward_proxy +``` + +### 关键组件 + +#### 1. Reward Cluster + +**位置**: `roll/pipeline/agentic/agentic_pipeline.py:88-98` + +Reward Cluster 是可选组件,仅在配置了 `device_mapping` 时创建: + +```python +self.reward = None +if (self.pipeline_config.reward is not None and + len(self.pipeline_config.reward.device_mapping) > 0): + self.reward = Cluster( + name=self.pipeline_config.reward.name, + worker_cls=self.pipeline_config.reward.worker_cls, # 默认 InferWorker + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reward, + ) +``` + +**Worker Class 默认配置**: `roll/pipeline/agentic/agentic_config.py:287` +- 默认使用 `InferWorker` 作为推理引擎,复用ActorInfer Worker实现 +- 支持 vLLM、SGLang等多种后端 + +#### 2. Reward Scheduler (Ray Named Actor) + +**位置**: `roll/pipeline/agentic/agentic_pipeline.py:112-125` + +Reward Scheduler 作为 Ray Named Actor 创建,供所有环境管理器共享访问: + +```python +self.reward_scheduler = RequestScheduler.options( + name=f"RewardScheduler-{self.pipeline_config.reward.name}", + get_if_exists=True, + namespace=RAY_NAMESPACE, + scheduling_strategy=NodeAffinitySchedulingStrategy(...) +).remote( + infer_cluster=self.reward, + pipeline_config=self.pipeline_config, + resource_manager=self.resource_manager, +) +``` + +**核心功能**: + +- **智能路由**: 使用最少负载路由算法分配请求到不同的 DP rank +- **粘性路由**: 同一环境的请求固定路由到同一 worker(利于 KV cache) +- **请求追踪**: 维护 `request_id` 到 worker 的映射关系 + +#### 3. Reward Proxy + +**位置**: `roll/pipeline/agentic/env_manager/vl_traj_env_manager.py:85-109` + +环境管理器通过 Ray 获取 Reward Scheduler 并创建 Reward Proxy: + +```python +# 从 Ray 获取 reward scheduler (Named Actor) +if self.pipeline_config.reward: + self.reward_scheduler = ray.get_actor( + name=f"RewardScheduler-{pipeline_config.reward.name}", + namespace=RAY_NAMESPACE + ) + + # 创建 reward proxy + self.reward_proxy = create_llm_proxy( + generate_scheduler=self.reward_scheduler, + llm_proxy_config=pipeline_config.reward.llm_proxy, + tokenizer=self.reward_tokenizer, + env=None, + ) +``` + +**Proxy 工厂函数**: `roll/pipeline/agentic/llm_proxy/__init__.py:11` +- 支持多种 proxy 类型:`policy`、`openai`、`random` +- 通过注册机制实现可扩展性 +- 训练验证过policy设置功能正常,基于外部部署的大模型服务可使用openai proxy,注意对并发的挑战 + +#### 4. 
统一工具函数 `generate_by_proxy` + +**位置**: `roll/pipeline/agentic/llm_proxy/proxy_utils.py:18-170` + +这是env调用的核心组件,提供统一的 LLM 调用接口: + +```python +def generate_by_proxy( + messages: List[Dict[str, Any]], + tokenizer: PreTrainedTokenizer, + proxy: BaseLLMProxy, + enable_thinking: bool = False, + generation_config: Optional[Dict[str, Any]] = None, + collator: Optional[Any] = None, + mm_data: Optional[Dict[str, Any]] = None, + src_rank: Optional[int] = None, +) -> Optional[str] +``` + +**核心特性**: + +- **统一接口**: 无论文本还是多模态,都使用相同的调用方式 +- **自动格式化**: 使用 `tokenizer.apply_chat_template` 格式化消息 +- **多模态支持**: 通过 `collator` 参数支持图像/视频输入 +- **thinking 机制**: 支持 DeepSeek、Qwen 等模型的思考链 +- **路由控制**: 通过 `src_rank` 参数实现粘性路由 +- **错误处理**: 返回 `None` 表示推理失败,由调用方处理 + +## 调用链路 + +### 完整调用流程 + +``` +1. DeepEyesEnv.step() (env/deepeyes/env.py:182-197) + 当 done=True 时触发 obtain_outcome_reward + ↓ +2. DeepEyesEnv.obtain_outcome_reward() (env/deepeyes/env.py:199-254) + 构建 judge prompt,调用 reward model + ↓ +3. generate_by_proxy() (llm_proxy/proxy_utils.py:18) + 统一的 LLM 调用工具函数 + ↓ +4. reward_proxy.generate() (llm_proxy/policy_proxy.py:15) + 通过 Ray 调用 scheduler + ↓ +5. reward_scheduler.generate_one_request() (scheduler/generate_scheduler.py:1296) + 请求路由与负载均衡 + ↓ +6. infer_cluster.workers[dp_rank].generate_request() + 实际的模型推理 + ↓ +7. 返回 LLM 判断结果 +``` + +## 配置说明 + +### 完整配置示例 + +```yaml +# Reward 配置 (LLM as Judge for AgenticPipeline) +reward: + name: "reward" + worker_cls: "roll.pipeline.base_worker.InferWorker" # 默认值,可省略 + model_args: + model_name_or_path: Qwen/Qwen2.5-72B-Instruct + dtype: bf16 + generating_args: + max_new_tokens: 2048 + temperature: 0.2 # 较低温度提高判断稳定性 + top_p: 0.95 + top_k: 20 + strategy_args: + strategy_name: vllm # 或 sglang + strategy_config: + gpu_memory_utilization: 0.8 + tensor_parallel_size: 4 + load_format: auto + # 关键:必须非空才会创建 reward cluster + device_mapping: list(range(8, 16)) # GPUs 8-15 + llm_proxy: + proxy_type: policy # 使用 policy proxy +``` + +### 配置关键点 + +#### 1. device_mapping(必须配置) + +```yaml +# 推荐配置:Policy 和 Reward 使用独立 GPU +actor_infer: + device_mapping: list(range(0, 8)) # GPUs 0-7 + +reward: + device_mapping: list(range(8, 16)) # GPUs 8-15,独立资源 +``` + +- **空或 None**: 不创建 reward cluster,环境无法使用 LLM as Judge +- **非空**: 创建独立的 reward cluster,支持 LLM as Judge +- **独立部署**: 与 actor_infer 使用不同的 GPU 资源,Policy 推理和 Reward 评估并行执行,actor_infer与reward必须得独立部署 + +#### 2. strategy_name(推理后端选择) + +```yaml +strategy_args: + strategy_name: vllm # 或 sglang + strategy_config: + gpu_memory_utilization: 0.8 + tensor_parallel_size: 4 + load_format: auto # 必须配置auto, vllm/sglang strategy里默认使用dummy load,会随机初始化参数 +``` + +#### 3. generating_args(生成参数) + +```yaml +generating_args: + max_new_tokens: 2048 # 根据 judge 输出长度调整 + temperature: 0.2 # 较低温度提高稳定性 + top_p: 0.95 + top_k: 20 +``` + +## 总结 + +LLM as Judge 在 Agentic 环境中的优化实现通过以下关键设计实现高效可扩展: + +1. **独立 Reward Cluster**: 资源隔离,避免与 Policy 推理竞争 +2. **Ray Named Actor**: Reward Scheduler 作为共享服务,供所有环境访问 +3. **统一工具函数**: `generate_by_proxy` 简化调用,支持文本和多模态 +4. 
**智能路由**: 粘性路由和负载均衡,提高缓存利用率 + +通过合理配置和使用这些组件,可以构建高效、可靠的 LLM as Judge 评估系统。 diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/rollout_mock_usage.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/rollout_mock_usage.md new file mode 100644 index 000000000..dc0fbc026 --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/Development/Developer Guide/rollout_mock_usage.md @@ -0,0 +1,288 @@ +--- +sidebar_position: 5 +--- + +# Rollout Dump Mock 使用指南 + +## 概述 + +Rollout Dump Mock是ROLL框架提供的强大调试工具,用于**消除RL训练中rollout阶段的随机性**,实现数值级精度对齐验证。它通过保存和回放rollout数据,帮助开发者快速验证计算优化的正确性。 + +### 核心价值 + +- **消除随机性**:实现数值级精度对齐验证 +- **快速迭代**:Mock模式下跳过昂贵的环境rollout +- **可复现调试**:捕获问题rollout数据,反复调试 +- **架构透明**:在Scheduler层实现,对Pipeline完全无感知 + +### 适用场景 + +| 场景 | 说明 | +|------|------| +| **计算优化验证** | 验证dynamic_batching、sequence_packing等优化的数值一致性 | +| **模型并行验证** | 验证TP、PP、EP等并行策略的精度对齐 | +| **回归测试** | CI/CD中自动化精度测试 | + +--- + +## 快速开始 + +### 典型工作流 + +``` +[1. Dump模式] → [2. 修改代码] → [3. Mock模式] → [4. 精度验证] + ↓ ↓ ↓ ↓ + 捕获基准数据 优化计算逻辑 确定性回放 数值对比 +``` + +### Step 1: Dump模式 - 捕获基准数据 + +在修改代码前,先捕获正确的rollout数据作为基准。 + +**配置文件** (`agentic_sokoban_rollout_mock_dump.yaml`): +```yaml +exp_name: "sokoban_precision_test_dump" +max_steps: 50 + +# Rollout Mock Configuration - DUMP MODE +rollout_mock: + enable: true + mode: dump + dump_dir: ./output/rollout_dumps/baseline_v1 + +# 用于确定性执行的环境变量 +system_envs: + NCCL_ALGO: Ring + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' + CUBLAS_WORKSPACE_CONFIG: ':4096:8' + DETERMINISTIC_MODE: '1' + +# ... 其他配置 ... +``` + +**命令**: +```bash +python examples/start_agentic_pipeline.py \ + --config_name agentic_sokoban_rollout_mock_dump \ + --config_path examples/qwen2.5-0.5B-agentic +``` + +**输出**: +``` +./output/rollout_dumps/baseline_v1/ + └── train/ + ├── step_000000.pkl (~5MB) + ├── step_000001.pkl + ├── step_000002.pkl + ├── ... + └── step_000049.pkl +``` + +**日志示例**: +``` +[Rollout Mock] Rollout Mock enabled: mode=dump, dir=./output/rollout_dumps/baseline_v1 +[Rollout Mock] Dumped step 0: ./output/rollout_dumps/baseline_v1/train/step_000000.pkl (samples=128, size=4.82MB) +[Rollout Mock] Dumped step 1: ./output/rollout_dumps/baseline_v1/train/step_000001.pkl (samples=128, size=4.85MB) +``` + +### Step 2: 修改代码 + +实现你的计算优化,例如: +- 添加dynamic_batching +- 实现sequence_packing +- 迁移到新的并行策略 + +### Step 3: Mock模式 - 确定性回放 + +使用预录制的rollout数据,验证修改后的代码是否保持数值一致性。 + +**配置文件** (`agentic_sokoban_rollout_mock_mock.yaml`): +```yaml +exp_name: "sokoban_precision_test_mock" +max_steps: 50 + +# Rollout Mock Configuration - MOCK MODE +rollout_mock: + enable: true + mode: mock + dump_dir: ./output/rollout_dumps/baseline_v1 # 与dump模式相同路径 + +# 用于确定性执行的环境变量(保持与dump模式一致) +system_envs: + NCCL_ALGO: Ring + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' + CUBLAS_WORKSPACE_CONFIG: ':4096:8' + DETERMINISTIC_MODE: '1' + +# ... 其他配置(保持与dump模式一致)... 
+``` + +**命令**: +```bash +python examples/start_agentic_pipeline.py \ + --config_name agentic_sokoban_rollout_mock_mock \ + --config_path examples/qwen2.5-0.5B-agentic +``` + +**行为**: +- ✅ 直接从磁盘加载每步的DataProto +- ✅ 后续所有计算(advantages, losses, gradients)完全确定 + +**日志示例**: +``` +[Rollout Mock] Rollout Mock enabled: mode=mock, dir=./output/rollout_dumps/baseline_v1 +[Rollout Mock] Loaded step 0: ./output/rollout_dumps/baseline_v1/train/step_000000.pkl (samples=128) +[Rollout Mock] Loaded step 1: ./output/rollout_dumps/baseline_v1/train/step_000001.pkl (samples=128) +``` + +### Step 4: 数值精度验证 + +对比baseline和优化版本的训练指标,确保数值完全一致。可以通过查看日志中的关键指标(如pg_loss、total_loss、value_loss、approx_kl、grad_norm等)来验证两次运行的结果是否一致。 + +--- + +## 配置参数 + +### 配置Schema + +在你的YAML配置文件中添加 `rollout_mock` 段: + +```yaml +rollout_mock: + enable: bool # 启用rollout dump/mock机制 + mode: "dump" | "mock" # dump: 保存数据, mock: 加载数据 + dump_dir: str # 数据存储目录 +``` + +### 配置示例 + +**Dump模式配置**: +```yaml +rollout_mock: + enable: true + mode: dump + dump_dir: ./rollout_dumps/precision_test_v1 +``` + +**Mock模式配置**: +```yaml +rollout_mock: + enable: true + mode: mock + dump_dir: ./rollout_dumps/precision_test_v1 # 与dump模式相同路径 +``` + +### 确定性执行的环境变量 + +为确保完全的数值可复现性,需要配置以下环境变量: + +```yaml +system_envs: + NCCL_ALGO: Ring # 使用Ring算法进行NCCL通信 + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' # 禁用Transformer Engine中的非确定性算法 + CUBLAS_WORKSPACE_CONFIG: ':4096:8' # 启用确定性的CUDA操作 + DETERMINISTIC_MODE: '1' # 启用PyTorch确定性模式 +``` + +**DETERMINISTIC_MODE 的作用**: +- 设置 `torch.backends.cudnn.deterministic = True` 以确保cuDNN操作的可复现性 +- 设置 `torch.backends.cudnn.benchmark = False` 禁用导致非确定性的自动调优 +- 调用 `torch.use_deterministic_algorithms(True)` 强制使用确定性的PyTorch算法 + +**重要提示**:这些环境变量在dump和mock模式之间必须保持一致,以确保数值精度对齐。 + +### 关键注意事项 + +1. **dump_dir必须一致**:Dump和Mock模式必须使用相同的`dump_dir`路径 +2. **mode必须匹配**:Scheduler的mode(train/val)必须与dump时一致 +3. **max_steps不能超过**:Mock模式的`max_steps`不能超过Dump模式时的值 +4. **system_envs必须一致**:确定性执行的环境变量在dump和mock模式之间必须保持一致 + +--- + +## 常见问题与排查 + +### 问题1: Mock文件不存在 + +**错误信息**: +``` +FileNotFoundError: [Rollout Mock] Mock文件不存在: ./dumps/baseline/train/step_000005.pkl +可能的原因: + 1. 未在dump模式下运行过step 5 + 2. dump_dir配置不正确: ./dumps/baseline + 3. mode不匹配(当前: train) +请先以dump模式运行,确保生成了所有步骤的数据。 +``` + +**排查步骤**: + +1. 检查dump模式下是否运行了足够的步骤: + ```bash + ls -lh ./output/rollout_dumps/baseline_v1/train/ + # 应该看到 step_000000.pkl ~ step_000049.pkl + ``` + +2. 确认`max_steps`一致: + ```bash + # Dump时: max_steps=50 + # Mock时: max_steps=50 (必须一致或更小) + ``` + +3. 
确认`dump_dir`路径正确: + ```yaml + # Dump时 + dump_dir: ./output/rollout_dumps/baseline_v1 + + # Mock时 (必须相同) + dump_dir: ./output/rollout_dumps/baseline_v1 + ``` + +### 问题2: Mode不匹配 + +**问题**:Dump时使用train mode,Mock时误用val mode。 + +**文件结构**: +``` +dumps/baseline/ + ├── train/ # Dump时生成 + │ └── step_*.pkl + └── val/ # 空目录 + └── (无文件) +``` + +**解决**:确保dump和mock使用相同的scheduler mode(train/val)。 + +### 问题3: 磁盘空间不足 + +**症状**:Dump过程中报错: +``` +OSError: [Errno 28] No space left on device +``` + +**估算磁盘占用**: +``` +单步文件大小 ≈ batch_size × seq_len × 数据类型大小 + ≈ 128 × 512 × 4 bytes (float32) + ≈ 256KB ~ 10MB (取决于序列长度和metadata) + +总磁盘占用 ≈ 单步大小 × max_steps + ≈ 5MB × 100 steps = 500MB +``` + +**解决**: +- 增加磁盘空间 +- 减少`max_steps` +- 使用网络存储(OSS等) + +### 问题4: Pickle版本不兼容 + +**症状**:在不同Python版本间加载报错: +``` +pickle.UnpicklingError: invalid load key, '\x00' +``` + +**原因**:Pickle在不同Python版本间的兼容性问题。 + +**解决**: +- 确保dump和mock使用相同Python版本 +- 或在dump时使用较低的protocol版本(需修改源码) diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/dynamic_batching.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/dynamic_batching.md new file mode 100644 index 000000000..20f9cc7f4 --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/dynamic_batching.md @@ -0,0 +1,214 @@ +# ROLL Dynamic Batching + +ROLL 框架支持对 Rollout Batch 做 **Dynamic Batching** 功能,尽量减少无效 token 计算,使得计算效率更高,本文档详细介绍如何使用这一功能。 + +## 术语列表 + +- attention_mask: rollout batch中的数据,其中 `1` 表示实际需要被计算的token,`0` 表示 pad_token; +- micro_batch (mbs): 模型前向处理时的微批次; +- num_micro_batches: 每个mini-batch中micro_batch数量; +- micro_batch_size: 每个微批次中序列数量; +- micro_batch_seqlen: 每个微批次中序列长度; +- dp_size, dp_rank, shard: 数据并行时的并行数量,以及在并行组中的编号,每个数据并行组中的训练数据; +- vpp: Virtual Pipeline Model Parallel,Megatron-LM框架中支持的一种高效流水线并行技术; + +## 简介 + +在RL训练场景中,每次rollout出来的数据具有十分显著的长尾效应,即序列长度不一致,尤其在Agentic Pipeline中,由于训练数据是多轮和Env相互产生的,导致这种长尾现象更为显著。 + +在训练时,通常会将一个rollout batch中的所有样本按照一个`max_len` pad到最长,这些pad_token也会参与计算,造成计算资源浪费; + +为了解决这一问题,提高计算效率,Dynamic Batching技术核心思路是: +- 对整个rollout batch中的样本在DP Rank维度上按照token数进行划分,使得计算资源尽量均衡; +- 改变样本中序列的顺序,使得临近的样本,长度尽量接近,能够去掉尽量多的pad token; + +## 示例 +下面通过一个例子,简要说明 ROLL 中 Dynamic Batching 流程 + +假设 `dp_size=2`, `num_seqs=8`, `max_tokens_microbatch=10`, `sequence_length_round=2` + +原始输入 `attention_mask` 如下 +```bash +attention_mask: +[1, 1, 1, 1, 1, 1, 1, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 0, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 1, 1, 0, 0] +[1, 1, 1, 1, 1, 0, 0, 0, 0, 0] +[1, 0, 0, 0, 0, 0, 0, 0, 0, 0] +[1, 1, 1, 0, 0, 0, 0, 0, 0, 0] +[1, 1, 1, 1, 1, 1, 1, 1, 0, 0] +[1, 1, 1, 1, 1, 1, 0, 0, 0, 0] +``` +其对应的 `seq_lens` 如下: + +```bash +seq_lens: +[7, 6, 8, 5, 1, 3, 8, 6] +``` + +可见序列之间的实际 token 数量是不均衡的,会浪费大量 GPU 时间在处理 `pad_token` 上 + +为了计算效率,ROLL Dynamic Batching 基于下面的步骤来消除 `micro_batch` 中的 pad_token,从而达到资源利用的最大化。 + +1. shard表示每个`dp_rank`中的训练数据,默认按照顺序切分,在Dynamic Batching中会基于序列实际长度排序并切分shard,使得 `dp_rank` 之间的tokens数均匀 + +```bash +# seq_lens 排序后: +[1, 3, 5, 6, 6, 7, 8, 8] +# 切分成dp_size个shard +shard0: + [1, 5, 6, 8] +shard1: + [3, 6, 7, 8] +``` + +2. 
对于每个shard划分 `micro_batch`; + +划分时需要考虑如下两个参数: +- max_tokens_per_microbatch: 每个micro_batch中最大token数量,`micro_batch_size * micro_batch_seqlen` 不能超过这个值,如果超过需要再生成一个新的 `micro_batch`; +- sequence_length_round: `micro_batch_seqlen` 需要能够被这个值整除;假设micro_batch中的序列长度为 `[200, 240]`,`sequence_length_round=64`,则这个micro_batch需要pad成`[256, 256]`; + +Dynamic Batching的划分shard流程就是找到小于max_tokens_per_microbatch的micro_batch中tokens数量最大的划分。且保证每个micro_batch的序列长度需要根据实际长度pad到 `sequence_length_round` 的倍数; + +具体如下所示: + +```bash +shard0: + mbs0: # padding长度6 + [1, 0, 0, 0, 0, 0 + 1, 1, 1, 1, 1, 0] + mbs1: # padding长度8 + [1, 1, 1, 1, 1, 1, 0, 0] + mbs2: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 1] +shard1: + mbs0: # padding长度6 + [1, 1, 1, 0, 0, 0 + 1, 1, 1, 1, 1, 1] + mbs1: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 0] + mbs2: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 1] +``` +在这个随机mask矩阵中,原来token总数为 `attention_mask.size(0) * attention_mask.size(1) = 80`,经过 Dynamic Batching 之后的 token 数量为:56,remove掉了 `30%` 的 pad_token + +3. 支持Virtual Pipelie Model Parallel,优先拆分tokens数量多且micro_batch_size > 1的micro_batch,使得micro_batch数量为pp_size整除倍(支持megatron) + +原来的这个例子中 `num_micro_batches` 不能够被 `pp_size` 整除,因此选择 `mbs0`,将其拆分成两个 mbs,如下所示: + +```bash +shard0: + mbs0: # padding长度6 + [1, 0, 0, 0, 0, 0] + mbs1: # padding长度6 + [1, 1, 1, 1, 1, 0] + mbs2: # padding长度8 + [1, 1, 1, 1, 1, 1, 0, 0] + mbs3: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 1] +shard1: + mbs0: # padding长度6 + [1, 1, 1, 0, 0, 0] + mbs1: # padding长度6 + [1, 1, 1, 1, 1, 1] + mbs2: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 0] + mbs3: # padding长度8 + [1, 1, 1, 1, 1, 1, 1, 1] + +``` + + + +## 参数配置 + +与 Dynamic Batching 相关的参数如下,分为 train 和 infer 两个部分 +- Train + - use_dynamic_batching_in_train: 是否在 `train_step` 时开启; + - max_tokens_per_microbatch_in_train: 训练时每个 micro_batch 最大 token 数量; + - sequence_length_round_in_train: 训练时每个 micro_batch 的序列长度需要能被这个参数整除,需要能够被 `tensor_model_parallel_size * context_parallel_size` 整除,一般取 128,64 即可; +- Infer + - use_dynamic_batching_in_infer: 是否在 `compute_log_probs` 等不需要梯度更新的环节开启; + - max_tokens_per_microbatch_in_infer: 与train中含义相同,根据显存消耗情况可以大一些; + - sequence_length_round_in_infer: 与train中含义相同; + + + +## 完整配置 + +```yaml +actor_train: + # 同时开启 Dynamic Batching 和 Context Parallel 时推荐使用 flash_attn + system_envs: + NVTE_FLASH_ATTN: '1' + NVTE_FUSED_ATTN: '0' + NVTE_UNFUSED_ATTN: '0' + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_train: true + max_tokens_per_microbatch_in_train: 8192 + sequence_length_round_in_train: 128 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: 
list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 + +``` diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/sequence_packing.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/sequence_packing.md new file mode 100644 index 000000000..ac3b183ba --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Advanced Features/sequence_packing.md @@ -0,0 +1,321 @@ +# ROLL SEQUENCE PACKING + +ROLL框架目前支持了Sequence Packing功能,通过句子打包来避免pad token,提高计算效率。本文档详细介绍该功能的实现思路以及相应使用配置方法。 + +> **注意**:目前只有 `megatron_strategy` 支持了 `sequence_packing`。 + +## 1. 简介 + +在RL训练场景中,rollout数据的分布通常具有长尾效应。而在常规的训练过程中,我们通常将一个micro batch的数据组合为一个batch进行训练,每条样本都会被pad到预设的最大长度,这不仅导致了算力被消耗在了大量pad token上,而且拖慢了训练速度。 + +为了解决上面的问题,ROLL中提供了Sequence Packing这一特性,其核心思路是: +* 将当前micro batch中长短不同的句子打包在一起以消除pad token +* 使用打包算法优化打包效率,减少micro batch数量,提高训练效率 + +## 2. 实现原理 + +### 2.1 数据划分层次结构 + +在分布式训练中,数据按照以下层次结构进行划分: + +``` +GLOBAL BATCH (全局批次) +├── DP RANK 0 → BATCH 0 +│ └── MINI BATCH 0 (用于一次梯度更新) +│ ├── MICRO BATCH 0 (最小计算单元) +│ ├── MICRO BATCH 1 +│ └── ... +├── DP RANK 1 → BATCH 1 +│ └── MINI BATCH 0 +│ ├── MICRO BATCH 0 +│ └── ... +└── ... +``` + +- **GLOBAL BATCH**: actor_infer产生的完整rollout结果 +- **BATCH**: Global Batch按DP rank划分后的子集 +- **MINI BATCH**: Batch中用于单次梯度更新的数据(考虑gradient accumulation) +- **MICRO BATCH**: Mini Batch进一步划分的最小计算单元,参与单次forward/backward + +在常规训练中,每个micro batch中的样本都会被padding到固定长度,造成大量计算资源浪费。Sequence Packing通过在micro batch级别进行序列打包来解决这个问题。 + +### 2.2 序列打包核心机制 + +Sequence Packing的核心目标是在消除padding token的同时,确保在复杂的分布式训练环境下(特别是Context Parallel和Tensor Parallel)能够正确、高效地运行。为了实现这一目标,打包过程需要满足特定的对齐要求,这些要求直接关系到模型能否正常训练以及训练效率的高低。 + +#### 2.2.1 对齐要求:2×CP_SIZE×TP_SIZE的倍数 + +在启用Context Parallel (CP) 和 Tensor Parallel (TP) 的情况下,序列长度必须是 **2 × CP_SIZE × TP_SIZE** 的倍数。 + +这个对齐要求来源于两个并行策略的需求: + +1. **TENSOR PARALLEL (TP) 需求**:当启用Sequence Parallel时,序列会在forward过程中被切分到不同的TP rank上处理,因此序列长度需要能被TP_SIZE整除。 + +2. **CONTEXT PARALLEL (CP) 需求**:为了实现CP负载均衡,序列需要被切分为2×CP_SIZE个逻辑块,因此序列长度需要能被2×CP_SIZE整除。 + +综合这两个需求,序列长度必须是 **2 × CP_SIZE × TP_SIZE** 的倍数,这样才能同时满足TP和CP的正确运行要求。 + +#### 2.2.2 为什么需要因子2?CP负载均衡详解 + +在Context Parallel (CP) 训练中,因果注意力机制的特殊性会导致严重的负载不均衡问题。 + +**问题根源 - 因果注意力的不对称性** + +考虑一个长度为6的序列 `[0, 1, 2, 3, 4, 5]`,在CP=2的情况下: + +``` +完整的因果注意力掩码: + 0 1 2 3 4 5 +0 [ 1 0 0 0 0 0 ] +1 [ 1 1 0 0 0 0 ] +2 [ 1 1 1 0 0 0 ] +3 [ 1 1 1 1 0 0 ] +4 [ 1 1 1 1 1 0 ] +5 [ 1 1 1 1 1 1 ] +``` + +**朴素切分方案的问题**: + +如果简单地将序列均分为两部分: +- CP0负责: `[0, 1, 2]` +- CP1负责: `[3, 4, 5]` + +那么实际的计算负载为: +- **CP0**: 只需要计算自己负责位置的注意力权重(6个权重计算) +- **CP1**: 需要计算自己负责位置对所有前面位置的注意力权重(15个权重计算) + +**负载比例: 6:15 = 2:5**,CP1的计算量是CP0的2.5倍! 
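+
+下面给出一个最小的计数示意脚本(假设:只按“因果注意力中第 i 行需要与 i+1 个 key 计算”来统计工作量,忽略 TP 与 padding,函数名均为示例),可以复现上面朴素切分的 6:15,并提前对比下文要介绍的 2×CP 交错切分在序列长度能被 2×CP_SIZE 整除时的均衡效果:
+
+```python
+def causal_workload(rank_rows):
+    # 因果注意力中,第 i 行(query i)需要与 i+1 个 key 做计算
+    return sum(i + 1 for i in rank_rows)
+
+def naive_split(seq_len, cp_size):
+    # 朴素切分:把序列连续均分成 cp_size 段
+    chunk = seq_len // cp_size
+    return [range(r * chunk, (r + 1) * chunk) for r in range(cp_size)]
+
+def interleaved_split(seq_len, cp_size):
+    # 2×CP 交错切分:切成 2*cp_size 个块,rank r 取第 r 块和第 2*cp_size-1-r 块
+    assert seq_len % (2 * cp_size) == 0, "演示要求 seq_len 能被 2*cp_size 整除"
+    chunk = seq_len // (2 * cp_size)
+    blocks = [list(range(b * chunk, (b + 1) * chunk)) for b in range(2 * cp_size)]
+    return [blocks[r] + blocks[2 * cp_size - 1 - r] for r in range(cp_size)]
+
+cp_size = 2
+print([causal_workload(rows) for rows in naive_split(6, cp_size)])        # [6, 15],即 2:5
+print([causal_workload(rows) for rows in interleaved_split(8, cp_size)])  # [18, 18],负载均衡
+```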
+ +**解决方案 - 2×CP交错切分** + +Megatron-Core采用的解决方案是将序列切分为 **2×CP** 个块,然后采用交错分配策略: + +``` +原始序列: [0, 1, 2, 3, 4, 5] +切分为4块: |[0,1]|[2,3]|[4,5]|[p,p]| (需要padding到4的倍数) + +交错分配: +- 块0 [0,1] → CP0 +- 块1 [2,3] → CP1 +- 块2 [4,5] → CP1 +- 块3 [p,p] → CP0 + +最终分配: +- CP0: [0,1] + [p,p] +- CP1: [2,3] + [4,5] +``` + +通过这种精心设计的分配策略,两个CP rank的计算负载变得相对均衡,避免了明显的性能瓶颈。 + +因此,**因子2是CP负载均衡的核心设计**,确保在因果注意力机制下各个CP rank的工作量基本相等。 + +#### 2.2.3 完整打包示例 + +假设当前microbatch包含以下样本(原始序列长度为8): + +| 样本ID | 原始序列 | 有效长度 | +|--------|----------|----------| +| 0 | `[0, 0, p, p, p, p, p, p]` | 2 | +| 1 | `[1, 1, 1, 1, p, p, p, p]` | 4 | +| 2 | `[2, 2, 2, 2, 2, 2, p, p]` | 6 | +| 3 | `[3, p, p, p, p, p, p, p]` | 1 | + +配置参数:`CP_SIZE=2`, `TP_SIZE=1` + +**步骤1:移除原始padding** +``` +样本0: [0, 0] +样本1: [1, 1, 1, 1] +样本2: [2, 2, 2, 2, 2, 2] +样本3: [3] +``` + +**步骤2:重新padding到对齐边界** +- 对齐因子 = 2 × CP_SIZE × TP_SIZE = 2 × 2 × 1 = 4 + +重新padding后的序列: +``` +样本0: [0, 0, p, p] → 长度4 +样本1: [1, 1, 1, 1] → 长度4 +样本2: [2, 2, 2, 2, 2, 2, p, p] → 长度8 +样本3: [3, p, p, p] → 长度4 +``` + +**步骤3:CP切分详细过程** + +在CP_SIZE=2的情况下,每个序列会被逻辑上切分为 **2×CP_SIZE = 4** 个部分,然后按照交错规则分配给不同的CP rank。 + +具体切分和分配规则如下: + +对于任意长度为L的序列,在CP_SIZE=2时: +- 序列被划分为4个连续的段:段0、段1、段2、段3 +- 每个段的长度为 L/4 +- 分配规则: + - **CP0**: 段0 + 段3 + - **CP1**: 段1 + 段2 + +应用到我们的例子: + +- **样本0** `[0, 0, p, p]` (长度4): + - 段0: `[0]`, 段1: `[0]`, 段2: `[p]`, 段3: `[p]` + - CP0获得: 段0 + 段3 = `[0] + [p]` → 实际处理 `[0, p]` + - CP1获得: 段1 + 段2 = `[0] + [p]` → 实际处理 `[0, p]` + +- **样本1** `[1, 1, 1, 1]` (长度4): + - 段0: `[1]`, 段1: `[1]`, 段2: `[1]`, 段3: `[1]` + - CP0获得: `[1] + [1]` → `[1, 1]` + - CP1获得: `[1] + [1]` → `[1, 1]` + +- **样本2** `[2, 2, 2, 2, 2, 2, p, p]` (长度8): + - 段0: `[2, 2]`, 段1: `[2, 2]`, 段2: `[2, 2]`, 段3: `[p, p]` + - CP0获得: `[2, 2] + [p, p]` → `[2, 2, p, p]` + - CP1获得: `[2, 2] + [2, 2]` → `[2, 2, 2, 2]` + +- **样本3** `[3, p, p, p]` (长度4): + - 段0: `[3]`, 段1: `[p]`, 段2: `[p]`, 段3: `[p]` + - CP0获得: `[3] + [p]` → `[3, p]` + - CP1获得: `[p] + [p]` → `[p, p]` + +**步骤4:各CP rank的最终打包结果** + +- **CP0的完整输入**: `[0, p, 1, 1, 2, 2, p, p, 3, p]` +- **CP1的完整输入**: `[0, p, 1, 1, 2, 2, 2, 2, p, p]` + +**步骤5:累积序列长度计算** + +Padded累积长度: `[0, 4, 8, 16, 20]` + +### 2.3 LOSS计算流程 + +在Sequence Packing模式下,loss计算需要特殊的处理流程: + +1. **模型输出解包**:使用`_unpack_sequences`函数将packed的输出还原为单个序列 + - 根据`cu_seqlens_padded`计算每个序列在当前CP rank上的起止位置 + - `seq_starts = cu_seqlens_padded[:-1] // cp_size` + - `seq_ends = cu_seqlens_padded[1:] // cp_size` + +2. **逐序列loss计算**: + - 对每个解包后的序列单独调用loss函数 + - 需要将原始数据调整到对应的序列长度(使用`adjust_sequence_length`) + - 累加所有序列的loss值 + +3. **结果聚合**: + - 将所有序列的loss相加得到总loss + - 聚合各个序列的metrics + - 应用loss scaling(如果启用) + +这种逐序列计算的方式确保了loss计算的正确性,即使在复杂的CP+TP+packing组合场景下也能准确计算梯度。 + +### 2.4 负载均衡优化 + +为了最大化Sequence Packing的效果,ROLL在多个层面应用了**Karmarkar-Karp算法**进行负载均衡优化。 + +**Karmarkar-Karp算法简介**: +这是一种经典的多路划分算法,用于将一组数字划分为k个子集,使得各子集的和尽可能接近。在Sequence Packing场景中,该算法被用来确保各个计算单元的负载相对均衡,避免性能瓶颈。 + +主要优化包括: +- **GLOBAL BATCH → DP RANK 负载均衡**:确保每个DP rank获得相似的总token数量 +- **MINI BATCH → MICRO BATCH 负载均衡**:确保每个micro batch的计算负载均衡 + +具体的实现细节和责任分工请参考第3.2节。 + +## 3. 实现流程 + +### 3.1 打包与解包核心逻辑 + +pack部分主要是在strategy中进行处理的,开启`use_sequence_packing`后strategy会自动对microbatch进行pack,并对输出的logits进行unpack并计算loss。 + +**核心打包函数 `_pack_sequences`** 实现了以下逻辑: +1. 移除原始padding,提取有效token +2. 计算累积序列长度(原始和padded版本) +3. 重新padding到`2*cp_size*tp_size`的倍数 +4. 处理CP切分和分配 +5. 拼接序列并创建`PackedSeqParams` + +**Loss计算**通过`loss_wrapper`实现解包和逐序列loss计算。 + +### 3.2 负载均衡责任分工 + +负载均衡在ROLL框架中有明确的责任分工: + +1. 
**GLOBAL BATCH → DP RANK 负载均衡**: + - **负责模块**: Pipeline层(`batch_balance`函数) + - **优化目标**: 确保每个DP rank获得相似的总token数量 + - **实现方式**: 在数据分发前使用Karmarkar-Karp算法重排序 + +2. **MINI BATCH → MICRO BATCH 负载均衡**: + - **负责模块**: Strategy层(`make_micro_batch_iter_for_sequence_packing`) + - **优化目标**: 确保每个micro batch的计算负载均衡 + - **实现方式**: 在micro batch生成时应用Karmarkar-Karp算法 + +3. **随机性保留**: + - Batch → Mini Batch的划分保持随机性(用于shuffle),因此不进行负载均衡优化 + +这种分层优化策略确保了从全局到局部的各个层面都能获得良好的负载均衡,最大化硬件利用率。 + +## 4. 参数配置 + +### 4.1 如何启用SEQUENCE PACKING + +要使用Sequence Packing功能,只需要在配置文件中设置 `use_sequence_packing: true` 即可。 + +### 4.2 配置参数详解(通俗版) + +#### `algorithm`(打包算法) +- **`none`**:默认的简单打包方式,按照数据原有的顺序进行打包 +- **`load_balance`**:智能负载均衡打包,会重新排列数据使得每个micro batch的计算量更加均衡,推荐使用 + +#### `max_packed_sequence_length_train`(训练时最大打包长度) +- 这个参数控制在训练时,打包后的序列最长可以有多长 +- 比如设置为8192,意味着打包后的序列总长度不会超过8192个token +- 设置合理的值可以避免内存溢出,同时保证打包效率 + +#### `max_packed_sequence_length_forward`(推理时最大打包长度) +- 和训练时的参数类似,但专门用于推理阶段 +- 通常可以和训练时设置相同的值 + +#### `min_num_micro_batches_train`(训练时最少micro batch数量) +- 控制每个mini batch至少要分成多少个micro batch +- 设置为1表示不限制,让系统自动决定最优的划分方式 +- 如果遇到显存不足的问题,可以适当增大这个值来减少每个micro batch的大小 + +#### `min_num_micro_batches_forward`(推理时最少micro batch数量) +- 和训练时的参数类似,但用于推理阶段 + +### 4.3 完整配置示例 + +```yaml +actor_train: + # 启用sequence packing功能 + use_sequence_packing: True + + # sequence packing的具体配置 + sequence_packing_args: + # 使用负载均衡算法,效果更好 + algorithm: load_balance + + # 训练时打包后的最大序列长度为8192 + max_packed_sequence_length_train: 8192 + + # 推理时打包后的最大序列长度为8192 + max_packed_sequence_length_forward: 8192 + + # 训练时最少分成1个micro batch(即不限制) + min_num_micro_batches_train: 1 + + # 推理时最少分成1个micro batch + min_num_micro_batches_forward: 1 + + # 必须使用megatron策略才能支持sequence packing + strategy_args: + strategy_name: megatron_train +``` + +### 4.4 使用建议 + +1. **必选条件**:只能在`megatron_train`或`megatron_infer`策略下使用 +2. **推荐配置**:建议使用`load_balance`算法,可以获得更好的性能 +3. **长度设置**:`max_packed_sequence_length`应该根据你的GPU显存大小来调整,一般可以设置为模型支持的最大序列长度 +4**自定义Loss函数**:如果是自定义loss func使用sequence packing的话,请参考自定义loss func文档,确保正确设置了`apply_loss_scale`参数 + +通过合理配置Sequence Packing,可以在保持模型性能的同时显著提升训练效率,特别是在处理变长序列的强化学习场景中效果尤为明显。 \ No newline at end of file diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/fsdp2.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/fsdp2.md new file mode 100644 index 000000000..ae3337829 --- /dev/null +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/fsdp2.md @@ -0,0 +1,242 @@ +# FSDP2 训练和推理后端配置指南 + +[FSDP2 (Fully Sharded Data Parallel 2)](https://docs.pytorch.org/tutorials/intermediate/FSDP_tutorial.html) 是 PyTorch 最新的分布式训练框架,提供高效的参数分片和 [DTensor](https://docs.pytorch.org/docs/stable/distributed.tensor.html) 支持。本文档将详细介绍如何在 ROLL 框架中配置和使用 FSDP2 后端。 + +## FSDP2 与 ROLL + +ROLL 支持以下 FSDP2 特性: +1. **FSDP2 分片**:使用 FSDP2 [fully_shard](https://docs.pytorch.org/docs/main/distributed.fsdp.fully_shard.html) 分片模型参数、梯度和优化器状态。同时支持使用 [DCP](https://docs.pytorch.org/tutorials/recipes/distributed_checkpoint_recipe.html) 进行检查点管理。 +2. **上下文并行**:支持与序列并行(Ulysses)集成 +3. 
**模型支持**:支持文本模型、视觉语言(VL)模型和 MoE(混合专家)模型。 + +## 配置 FSDP2 策略 + +在 ROLL 框架中,可以通过在 YAML 配置文件中设置 `strategy_args` 来配置 FSDP2 训练和推理策略。 + +### 训练配置示例 + +以下是一个典型的 FSDP2 训练配置示例(来自 `examples_lixing/qwen3-8B-rlvr_fsdp2/rlvr_config.yaml`): + +```yaml +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 32 + warmup_steps: 20 + num_train_epochs: 50 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + fsdp_size: 16 + param_dtype: bf16 + reduce_dtype: float32 + reshard_after_forward: true + offload_policy: false + device_mapping: list(range(0,16)) + infer_batch_size: 4 +``` + +### 推理配置示例 + +以下是一个典型的 FSDP2 推理配置示例: + +```yaml +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + strategy_args: + strategy_name: fsdp2_infer + strategy_config: + fsdp_size: 4 + param_dtype: bf16 + reduce_dtype: float32 + reshard_after_forward: true + offload_policy: false + device_mapping: list(range(0,8)) + infer_batch_size: 1 +``` + +### FSDP2 + 上下文并行配置示例 + +以下是一个结合 FSDP2 和序列并行(Ulysses)的配置示例(来自 `examples_lixing/qwen3-4b-vl_fsdp2_lct/vl_fsdp2_lct_cp2.yaml`): + +```yaml +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + ulysses_size: 2 # 序列并行大小 + training_args: + learning_rate: 1.0e-6 + weight_decay: 1.0e-2 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 256 + warmup_steps: 0 + num_train_epochs: 50 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + fsdp_size: 4 # FSDP 分片大小 + param_dtype: bf16 + reduce_dtype: float32 + reshard_after_forward: true + offload_policy: false + device_mapping: list(range(0,8)) + infer_batch_size: 1 +``` + +在此示例中: +- 总 GPU 数:8 +- 上下文并行(Ulysses)大小:2 +- FSDP 大小:4 +- 设备网格形状:(2, 4) [ddp, fsdp] +- 2 个副本,每个副本有 4 路参数分片 + +### 配置参数详解 + +1. **strategy_name**: + - `fsdp2_train` 用于训练 + - `fsdp2_infer` 用于推理 + +2. **strategy_config**:FSDP2 特定的配置参数 + - `fsdp_size`:FSDP 分片数量 + - 如果 `fsdp_size >= world_size` 或 `fsdp_size <= 1`:纯 FSDP2 模式 + - 如果 `fsdp_size < world_size`:带有 DDP 副本的 HSDP 模式 + - `param_dtype`:参数数据类型(例如 `bf16`、`fp16`、`float32`) + - `reduce_dtype`:梯度归约的数据类型(例如 `float32`) + - `reshard_after_forward`:是否在前向传播后重新分片参数 + - `true`:前向传播后重新分片 + - `false`:保持参数gathered + - `offload_policy`:是否启用 CPU 卸载 + - `true`:在不使用时将参数卸载到 CPU(节省 GPU 内存) + - `false`:将所有参数保留在 GPU 上(更快但使用更多内存) + - `wrap_policy`:模块包装策略 + - `transformer_layer_cls_to_wrap`:要wrap的 Transformer 层类名列表(例如 `["Qwen3DecoderLayer"]`) + - `wrap_embeddings`:是否wrap input embedding(默认:`false`) + - `wrap_lm_output`:是否wrap LM head(默认:`false`) + - `moe_experts`:要包装的 MoE Expert类名列表(对于 MoE 模型,我们可能希望单独wrap每个expert以避免参数gather时OOM,但需要dummy前向传播以避免程序挂起,请参阅[示例](../../../../roll/third_party/fsdp2/qwen3_moe_patch.py)) + + 如果未设置 `wrap_policy`,默认将使用 transformers 模型的 `_no_split_modules`。 + - `apply_expert_patch`:是否应用 MoE 专家补丁(用于 MoE 模型) + - `true`:应用补丁以防止不同 rank 激活不同专家时的死锁 + - `false`:不应用补丁(在 MoE 模型中可能导致死锁) + - `apply_tiled_mlp`:是否应用 TiledMLP 优化 + - `true`:使用分块 MLP 计算以减少内存使用 + - `false`:使用标准 MLP 计算 + - `tiled_num_shards`:TiledMLP 的分片数量(默认:4) + - `async_save_ckpt`:是否异步保存checkpoint(默认:`true`) + +3. **ulysses_size**:序列并行大小(在 `model_args` 中设置) + - 在多个 GPU 之间拆分序列维度 + - 与 FSDP2 兼容以实现混合并行 + - 适用于长上下文训练 + +4. **device_mapping**:指定要使用的 GPU 设备 ID 列表 + +5. 
**infer_batch_size**:推理期间的批量大小 + +## 设备网格配置 + +FSDP2 根据 `fsdp_size` 和 `ulysses_size` 支持不同的设备网格配置: + +### FSDP2 模式 + +当 `fsdp_size >= world_size` 或 `fsdp_size <= 1` 时: + +```yaml +# 示例:16 个 GPU,fsdp_size=16 +strategy_config: + fsdp_size: 16 +# 设备网格:(16,) [fsdp] +# 所有 16 个 GPU 分片参数 +``` + +### HSDP 模式 + +当 `fsdp_size < world_size` 时: + +```yaml +# 示例:16 个 GPU,fsdp_size=8 +strategy_config: + fsdp_size: 8 +# ddp_size = 16 // 8 = 2 +# 设备网格:(2, 8) [ddp, fsdp] +# 2 个副本,每个副本有 8 路参数分片 +``` + +### FSDP2 + 序列并行(Ulysses) + +当同时配置 `ulysses_size` 和 `fsdp_size` 时: + +```yaml +# 示例:8 个 GPU,ulysses_size=2,fsdp_size=4 +model_args: + ulysses_size: 2 +strategy_config: + fsdp_size: 4 +# ddp_size = 8 // 4 = 2 +# 设备网格:(2, 4) [ddp, fsdp] +# 2 个副本,每个副本有 4 路参数分片 +# Ulysses:2 路序列并行(序列维度拆分) +``` + +## 模型特定配置 + +### 文本模型(Qwen2.5、Qwen3、LLaMA) + +```yaml +strategy_config: + fsdp_size: 16 + param_dtype: bf16 + reduce_dtype: float32 + wrap_policy: + transformer_layer_cls_to_wrap: ["Qwen3DecoderLayer"] +``` + +### 视觉语言模型(Qwen2.5-VL、Qwen3-VL) + +```yaml +actor_train: + model_args: + freeze_module_prefix: vision_model # 冻结 + ulysses_size: 2 # 可选:序列并行 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + fsdp_size: 4 + param_dtype: bf16 + reduce_dtype: float32 + # vision encoder自动禁用 cast_forward_inputs +``` + +### MoE 模型(Qwen3-MoE) + + +```yaml +strategy_config: + fsdp_size: 16 + param_dtype: bf16 + reduce_dtype: float32 + apply_expert_patch: true # 如果单独wrap每个expert + wrap_policy: + moe_experts: ["Qwen3MoeMLP"] +``` + +## 注意事项 + +1. **PyTorch 版本**:FSDP2 需要 PyTorch >= 2.4 +2. **MoE 模型**:如果单独wrap expert,始终启用 `apply_expert_patch: true` 以防止死锁(目前仅支持Qwen3-MoE) +3. **VL 模型**:对视Vision Encoder将默认`cast_forward_inputs=False`防止可能的精度问题 +4. **内存与性能**: + - `offload_policy: true` 节省内存但速度较慢 + - `reshard_after_forward: true` 节省内存但可能较慢 + - 根据硬件和要求进行平衡 \ No newline at end of file diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/vllm.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/vllm.md index f2cc4574e..84543d004 100644 --- a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/vllm.md +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Configuration/vllm.md @@ -74,21 +74,6 @@ actor_infer: 这种设计允许不同组件根据其需求选择最适合的推理引擎。 -### beam_search 配置方式 -RLVRPipeline 支持vllm beam_search 的生成方式,配置方式如下: -```yaml -generate_opt_level: 0 # 退化为batch_generate生成方式,generate_opt_level=1是prompt粒度并行方式 -num_return_sequences_in_group: 8 -actor_infer: - generating_args: - num_beams: ${num_return_sequences_in_group} - num_return_sequences: ${num_return_sequences_in_group} -``` -注意: -- generating_args.num_beams 和 generating_args.num_return_sequences 必须设置为相同的值。 -- validate中配置generating_args也是相同的方式。 - - ## 性能优化建议 1. 
**内存管理**: diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/agentic_pipeline_start.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/agentic_pipeline_start.md index 9036d13e5..aef64cc18 100644 --- a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/agentic_pipeline_start.md +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/agentic_pipeline_start.md @@ -32,6 +32,8 @@ Agentic Pipeline 是ROLL提供的智能体训练核心Pipeline,支持多种算 * **异步训练**: rollout/training解耦,支持异步训练 * **多轮交互支持本地调试**: 多轮交互rollout支持本地调试,提高多轮交互业务开发效率 * **灵活的策略配置**:支持多种分布式训练策略,如 Megatron、DeepSpeed、vLLM 等,可以根据硬件资源进行灵活配置。 +* **高效训练优化**:支持 **Sequence Packing**(将多条短样本拼接成连续序列,减少 padding)与 **Dynamic Batching**(根据样本长度动态组 +batch,按 batch 内最大长度统一 padding,最小化无效计算)。配置方法和实现原理详见`sequence packing`和`dynamic batching` 对应文档。 --- ## ✨️ 核心组件 diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/distill_pipeline_start.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/distill_pipeline_start.md index d30a40144..7d0a3f726 100644 --- a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/distill_pipeline_start.md +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/distill_pipeline_start.md @@ -33,6 +33,7 @@ * **高效分布式计算**:利用 [Ray](https://www.ray.io/) 框架,在大型 GPU 集群上实现高效的分布式训练,显著提升训练速度和资源利用率。 +* **高效训练优化**:支持 **Sequence Packing**(将多条短样本拼接成连续序列,减少 padding)。配置方法和实现原理详见`sequence packing`对应文档。 --- ## ✨️ 核心组件 diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/rlvr_pipeline_start.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/rlvr_pipeline_start.md index 81e597dce..38678d803 100644 --- a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/rlvr_pipeline_start.md +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/rlvr_pipeline_start.md @@ -41,6 +41,8 @@ * **高效的分布式计算**:利用[Ray](https://www.ray.io/)框架在大规模GPU集群上实现高效的分布式训练,显著提高训练速度和资源利用率。 +* **高效训练优化**:支持 **Sequence Packing**(将多条短样本拼接成连续序列,减少 padding)与 **Dynamic Batching**(根据样本长度动态组 +batch,按 batch 内最大长度统一 padding,最小化无效计算)。配置方法和实现原理详见`sequence packing`和`dynamic batching` 对应文档。 --- diff --git a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/sft_pipeline_start.md b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/sft_pipeline_start.md index b20326837..48e689089 100644 --- a/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/sft_pipeline_start.md +++ b/docs_roll/i18n/zh-Hans/docusaurus-plugin-content-docs/current/User Guides/Pipeline/sft_pipeline_start.md @@ -1,2 +1,273 @@ -# SFTPipeline Quick Start -施工中... 
\ No newline at end of file +# SFT 流水线 + +**目录** + +- [SFT 流水线](#sft-流水线) + - [✨️概述](#️概述) + - [✨️核心组件](#️核心组件) + - [主模块(`SFTPipeline`)](#主模块sftpipeline) + - [工作器(`SFTWorker`)](#工作器sftworker) + - [配置文件(`SFTConfig`)](#配置文件sftconfig) + - [配置文件结构和组织](#配置文件结构和组织) + - [✨️数据准备](#️数据准备) + - [数据格式](#数据格式) + - [必需字段与字段映射](#必需字段与字段映射) + - [对话模板与标签(labels)规则](#对话模板与标签labels规则) + - [验证集(`validation`)](#验证集validation) + - [✨️运行流水线](#️运行流水线) + - [方法1:使用Python启动脚本](#方法1使用python启动脚本) + - [方法2:使用辅助Shell脚本](#方法2使用辅助shell脚本) + - [✨️逐步示例](#️逐步示例) + - [步骤1:配置设置](#步骤1配置设置) + - [步骤2:准备环境和依赖](#步骤2准备环境和依赖) + - [步骤3:启动流水线](#步骤3启动流水线) + - [步骤4:监控](#步骤4监控) + - [步骤5:输出和结果](#步骤5输出和结果) + +--- + +## ✨️概述 + +此流水线用于监督微调(SFT),提供: + +* **统一的数据编码与对话模板**:支持 system/user/assistant 对话格式拼接,并自动构造 `labels`(仅对回答部分计 loss)。 +* **高效分布式训练**:使用 [Ray](https://www.ray.io/) + Cluster/Worker 抽象启动分布式训练。 +* **全面的性能监控**:细粒度度量跟踪系统,监控性能指标,为模型训练过程提供全面的可视化和分析能力。 +* **高效训练优化**:支持 **Sequence Packing**(将多条短样本拼接成连续序列,减少 padding)。配置方法和实现原理详见`sequence packing`对应文档。 + +--- + +## ✨️核心组件 + +### 主模块(`SFTPipeline`) + +`SFTPipeline`(位于 `roll/pipeline/sft/sft_pipeline.py`)是 SFT 训练的主流程,负责: + +* 加载 tokenizer。 +* 加载训练数据集 与(可选)验证数据集。 +* 按模板编码数据:生成 `input_ids` / `attention_mask` / `labels`。 +* 初始化分布式训练集群(`Cluster` + `SFTWorker`)。 +* 训练循环:按 step 训练、按 `eval_steps` 验证、按保存策略写 checkpoint、记录指标并上报 tracker。 + +--- + +### 工作器(`SFTWorker`) + +`SFTWorker`(位于 `roll/pipeline/sft/sft_worker.py`)负责执行训练、验证与保存: + +* `initialize()`:创建并初始化分布式策略(`create_strategy`),并加载模型。 +* `train_step()`:执行一次训练 step,返回训练 metrics。 +* `val_step()`:执行一次验证 step(前向 + loss),返回验证 metrics。 +* `do_checkpoint()`:保存 checkpoint,并返回保存耗时等 metrics。 + +--- + +### 配置文件(`SFTConfig`) + +`SFTConfig`(定义于 `roll/pipeline/sft/sft_config.py`)是 SFT 流水线的配置对象(dataclass 风格),支持通过 YAML + Hydra 管理。 + +#### 配置文件结构和组织 + +示例配置文件:`examples/qwen2.5-7B-sft_megatron/sft_config.yaml` + +配置通常包含以下部分: + +1. **实验基本设置** + * `exp_name`:实验名称 + * `seed`:随机种子 + * `logging_dir`:日志目录 + * `output_dir`:checkpoint/输出目录 + +2. **训练控制参数** + * `save_steps`:保存 checkpoint 的频率 + * `logging_steps`:记录训练指标的频率 + * `eval_steps`:验证频率(启用验证集时生效) + * `resume_from_checkpoint`:断点续训配置 + +3. **模型配置** + * `pretrain`:预训练模型路径 + +4. **数据字段映射(关键)** + * `system_key`:system prompt 字段(可选) + * `prompt_key`:prompt 字段名(默认 `instruction`) + * `query_key`:query 字段名(可选) + * `response_key`:response 字段名(默认 `output`) + * `global_template`:全局模板名(可选;否则使用 `sft_train.data_args.template`) + +5. **工作器配置(`sft_train`)** + `sft_train` 是一个 `WorkerConfig`,包含: + + * **数据参数**(`data_args`) + * `file_name`:训练数据 JSON 路径(字符串或列表) + * `template`:对话模板名(当未设置 `global_template` 时使用) + * `preprocessing_num_workers`:数据预处理并行数 + * **训练参数**(`training_args`) + * `num_train_epochs` + * `learning_rate` + * `per_device_train_batch_size` + * `gradient_accumulation_steps` + * `dataloader_num_workers` + * ... + * **策略参数**(`strategy_args`) + * `strategy_name`:如 `megatron_train` / `deepspeed_train` 等 + * 并行相关参数(tensor/pipeline 并行大小等) + * **设备映射**(`device_mapping`) + * 指定该 worker 使用哪些 GPU + * **验证 batch**(推理 batch) + * `infer_batch_size`:验证阶段使用 + +6. 
**验证配置(可选)** + * `validation.data_args.file_name`:验证集 JSON 路径(配置后才会启用验证) + +--- + +## ✨️数据准备 + +### 数据格式 + +SFT 流水线使用 **JSON** 文件,并通过 HuggingFace Datasets 加载。 + +#### 必需字段与字段映射 + +每条样本至少需要能映射出: + +* Prompt:由 `prompt_key` 指定(默认 `instruction`) +* Response:由 `response_key` 指定(默认 `output`) + +可选字段: + +* `system_key`:system prompt(可选) +* `query_key`:附加输入(可选,会拼到 user 内容中) + +#### 对话模板与标签(labels)规则 + +对话结构: + +- system(可选) +- user(prompt + query) +- assistant(response) + +labels 构造: + +* prompt 部分全部置为 `IGNORE_INDEX`(不参与 loss) +* response 部分使用真实 token id(参与 loss) + +即:只监督模型“回答部分”。 + +--- + +### 验证集(`validation`) + +验证集是可选项: + +* 仅当配置了 `validation.data_args.file_name` 才加载验证集。 +* 训练时按 `eval_steps` 触发验证。 +* 验证由 `sft_train.val_step` 执行(不会额外启动一个 validation worker)。 + +--- + +## ✨️运行流水线 + +### 方法1:使用Python启动脚本 + +使用 `examples/start_sft_pipeline.py` 启动,Hydra 负责加载配置: + +```bash +# 确保您在 ROLL 项目根目录 +# export PYTHONPATH=$(pwd):$PYTHONPATH + +python examples/start_sft_pipeline.py \ + --config_path examples/qwen2.5-7B-sft_megatron \ + --config_name sft_config +``` + +* `--config_path` – 配置目录:`examples/qwen2.5-7B-sft_megatron` +* `--config_name` – 配置文件名:`sft_config`(对应 `sft_config.yaml`) + +--- + +### 方法2:使用辅助Shell脚本 + +示例: + +```bash +#!/bin/bash +# 示例:examples/qwen2.5-7B-sft_megatron/run_sft_pipeline.sh + +CONFIG_NAME="sft_config" +CONFIG_PATH="examples/qwen2.5-7B-sft_megatron" + +python examples/start_sft_pipeline.py \ + --config_path $CONFIG_PATH \ + --config_name $CONFIG_NAME \ + "$@" +``` + +运行: + +```bash +bash examples/qwen2.5-7B-sft_megatron/run_sft_pipeline.sh +``` + +--- + +## ✨️逐步示例 + +### 步骤1:配置设置 + +配置文件:`examples/qwen2.5-7B-sft_megatron/sft_config.yaml` + +重点检查: + +* **数据配置**:`sft_train.data_args.file_name` +* **字段映射**:`prompt_key/query_key/response_key/system_key` +* **模型配置**:`pretrain` +* **分布式策略**:`sft_train.strategy_args` 与 `sft_train.device_mapping` +* **验证配置(可选)**:`validation.data_args.file_name` 与 `eval_steps` +* **模板选择**:`global_template` 或 `sft_train.data_args.template` + +### 步骤2:准备环境和依赖 + +```bash +pip install -r requirements.txt +``` + +并确保: + +* `pretrain` 路径可访问 +* 训练/验证 JSON 的字段与 `prompt_key/response_key/...` 对齐 + +### 步骤3:启动流水线 + +```bash +python examples/start_sft_pipeline.py \ + --config_path examples/qwen2.5-7B-sft_megatron \ + --config_name sft_config +``` + +### 步骤4:监控 + +* **控制台输出** – 观察 Hydra、Ray 与流水线日志 +* **日志文件** – 检查 `logging_dir` +* **TensorBoard** + ```bash + tensorboard --logdir + ``` + +### 步骤5:输出和结果 + +* **训练模型** – checkpoint 保存在 `output_dir` 下,默认目录结构为: + + ``` + /sft_train/checkpoint-// + ``` + + 其中: + * ``:当前训练步数(例如 `checkpoint-200`) + * ``:分布式集群名称(由 Cluster/Ray 运行时决定) + +* **训练/验证指标** – 记录在终端与 tracker/TensorBoard(取决于 tracker 配置) + +--- + +*祝您实验愉快!* \ No newline at end of file diff --git a/examples/agentic_deepeyes/deepeyes.yaml b/examples/agentic_deepeyes/deepeyes.yaml new file mode 100644 index 000000000..ce87caa94 --- /dev/null +++ b/examples/agentic_deepeyes/deepeyes.yaml @@ -0,0 +1,218 @@ +defaults: + - ../config/envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
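+  # keep the run in the current working directory; output_subdir: null below also disables the .hydra config snapshot directory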
+ output_subdir: null + +# Use standard AgenticPipeline instead of DeepEyesPipeline +# pipeline_cls defaults to roll.pipeline.agentic.agentic_pipeline.AgenticPipeline + +exp_name: "deepeyes_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/yuzhao/models/${exp_name} + +# track_with: tensorboard +# tracker_kwargs: +# log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard/roll_exp/deepeyes + +offload_nccl: true + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 200 +logging_steps: 1 +eval_steps: 0 +resume_from_checkpoint: false + +rollout_batch_size: 2048 # 4096 # batch_size for dataloader * group_size +val_batch_size: 1024 # batch_size for dataloader +# prompt_length: 8192 # data.max_prompt_length in deepeyes: 8192 +response_length: 1024 # data.max_response_length in deepeyes: 20480 +sequence_length: 16384 + +reward_clip: 20 +advantage_clip: 10.0 +ppo_epochs: 1 +adv_estimator: "grpo" +whiten_advantages: false +add_token_level_kl: false +use_kl_loss: false +init_kl_coef: 0.0 +entropy_loss_coef: 0 + +pretrain: Qwen/Qwen2.5-VL-7B-Instruct + +actor_train: + system_envs: + NVTE_FLASH_ATTN: '1' + NVTE_FUSED_ATTN: '0' + NVTE_UNFUSED_ATTN: '0' + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + freeze_module_prefix: "vision_model.blocks,vision_model.patch_embed" + training_args: + learning_rate: 1.0e-6 + lr_scheduler_type: constant + weight_decay: 1.0e-2 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 512 + warmup_steps: 5 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 2 + sequence_parallel: true + context_parallel_size: 2 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,16)) + infer_batch_size: 1 + offload_nccl: ${offload_nccl} + use_dynamic_batching_in_train: true + max_tokens_per_microbatch_in_train: 32768 + sequence_length_round_in_train: 8 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 32768 + sequence_length_round_in_infer: 8 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} # single-turn response length + top_p: 1 + top_k: -1 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + strategy_args: + strategy_name: vllm + strategy_config: + tensor_parallel_size: 1 + gpu_memory_utilization: 0.8 + block_size: 16 + # cache missing errors happen occasionally thus disable + disable_mm_preprocessor_cache: true + # enable_prefix_caching: false + sleep_level: 2 # 2 will destroy model parameter and kv_cache after generate to save cpu memory, 1 will destroy kv_cache only. 
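+    # note: actor_infer only maps to GPUs 0-11; the reward cluster below uses GPUs 12-15, while actor_train/reference map to 0-15 and share devices with both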
+ device_mapping: list(range(0,12)) + offload_nccl: ${offload_nccl} + +reference: + system_envs: + NVTE_FLASH_ATTN: '1' + NVTE_FUSED_ATTN: '0' + NVTE_UNFUSED_ATTN: '0' + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + strategy_args: + strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 2 + context_parallel_size: 2 + sequence_parallel: true + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + device_mapping: list(range(0,16)) + infer_batch_size: 1 + offload_nccl: ${offload_nccl} + use_dynamic_batching_in_train: true + max_tokens_per_microbatch_in_train: 32768 + sequence_length_round_in_train: 8 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 32768 + sequence_length_round_in_infer: 8 + +# Reward cluster configuration for LLM-as-judge +# Uses InferWorker (default from AgenticConfig) for reward model inference +reward: + name: deepeyes_reward + # worker_cls defaults to InferWorker from AgenticConfig + model_args: + model_name_or_path: Qwen/Qwen2.5-72B-Instruct + dtype: bf16 + generating_args: + temperature: 0.3 # Lower temperature for stable judgment + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + tensor_parallel_size: 4 + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(12,16)) + +max_actions_per_traj: 5 +reward_normalization: + grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv + method: mean_std # asym_clip / identity / mean_std + +custom_envs: + deepeyes: + env_type: deepeyes + max_steps: ${max_actions_per_traj} # used in environment state manager to control the actual max actions executed per trajectory + # used to curate llm prompt "max words", not used for rollout, + # single_response_max_tokens in deepeyes: 10240 + max_tokens_per_step: ${response_length} + env_manager_cls: roll.pipeline.agentic.env_manager.vl_traj_env_manager.VLTrajEnvManager + use_thread_lock: true + # max_env_step_concurrent: 256 # Control concurrent reward computation + agent_system_template: "" + pre_step_template: "" + next_step_template: "" + env_config: + data_args: + file_name: /data/oss_bucket_0/yuzhao/data/ChenShawn/DeepEyes-Datasets-47k/data_0.1.2_visual_toolbox_v2.parquet + preprocessing_num_workers: 64 + max_steps: ${max_actions_per_traj} + seed: ${seed} + mode: train + epoch: 0 + idx: 0 + # Reward weights for DeepEyes environment + acc_weight: 0.8 + format_weight: 0.2 + tool_weight: 1.2 + enable_thinking: false + +train_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 256 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [deepeyes] + num_groups_partition: + - 256 +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: ${val_batch_size} + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [deepeyes] + num_groups_partition: + - ${val_batch_size} diff --git a/examples/agentic_deepeyes/run_agentic_pipeline.sh b/examples/agentic_deepeyes/run_agentic_pipeline.sh new file mode 100755 index 000000000..9bd80bcb1 --- /dev/null +++ b/examples/agentic_deepeyes/run_agentic_pipeline.sh @@ -0,0 +1,6 @@ +#!/bin/bash +set +x + +CONFIG_PATH=$(basename $(dirname $0)) +python examples/start_agentic_pipeline.py --config_path $CONFIG_PATH --config_name deepeyes_2gpus + diff --git 
a/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var.yaml b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var.yaml new file mode 100644 index 000000000..646f8e97d --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var.yaml @@ -0,0 +1,175 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_frozen_lake + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 1024 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + # worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker + worker_cls: roll.pipeline.agentic.agentic_actor_pg_worker.ActorWorker + pg_variant: topr # topr, vanilla, tis, cispo, kimi15, ppo + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reward_normalization: + grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... 
group_by计算reward/adv + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [FrozenLake] + num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 1024 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} + FrozenLakeLocallyDefineExamples: # Can import from unified envs config or define dict locally + env_type: frozen_lake + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: ${env_manager_cls} + use_thread_lock: true + env_config: + env_instruction: "You are solving the FrozenLake puzzle. Forbid the whole and go to the target. You may move to the unintended direction due to the slippery ice. The answer must be one of action in a turn, format is Right" + action_pattern: ${think_action_pattern} + max_steps: ${max_actions_per_traj} + is_slippery: false diff --git a/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var_is_correct.yaml b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var_is_correct.yaml new file mode 100644 index 000000000..8e1e79631 --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake-pg_var_is_correct.yaml @@ -0,0 +1,192 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_frozen_lake + + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 1024 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +# is_correct: +# is_ratio_type: "token" +# is_upper_bound: 0.5 + +enable_old_logprobs_recompute: True +train_infer_correction: + filters: + - enabled: true + agg_type: segment + ratio_enabled: true + diff_enabled: true + ratio_low: 0.8 + ratio_high: 1.05 + diff_low: -0.1 + diff_high: 0.1 + +actor_train: + # worker_cls: roll.pipeline.rlvr.actor_pg_worker.ActorPGWorker + worker_cls: roll.pipeline.agentic.agentic_actor_pg_worker.ActorWorker + pg_variant: topr # topr, vanilla, tis, cispo, kimi15, ppo + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reward_normalization: + grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [FrozenLake] + num_groups_partition: [128] # If not set, all env names divide nums equally. 
Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 1024 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} + FrozenLakeLocallyDefineExamples: # Can import from unified envs config or define dict locally + env_type: frozen_lake + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: ${env_manager_cls} + use_thread_lock: true + env_config: + env_instruction: "You are solving the FrozenLake puzzle. Forbid the whole and go to the target. You may move to the unintended direction due to the slippery ice. The answer must be one of action in a turn, format is Right" + action_pattern: ${think_action_pattern} + max_steps: ${max_actions_per_traj} + is_slippery: false diff --git a/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake.yaml b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake.yaml index 3ff1d9cb1..b93ad3b75 100644 --- a/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake.yaml +++ b/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake.yaml @@ -95,13 +95,6 @@ actor_infer: model_args: disable_gradient_checkpointing: true dtype: bf16 - generating_args: - max_new_tokens: 128 # single-turn response length - top_p: 0.99 - top_k: 100 - num_beams: 1 - temperature: 0.99 - num_return_sequences: 1 data_args: template: qwen2_5 strategy_args: @@ -137,6 +130,12 @@ train_env_manager: group_size: 8 tags: [FrozenLake] num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + temperature: 0.99 + num_return_sequences: 1 val_env_manager: max_env_num_per_worker: 32 @@ -144,6 +143,12 @@ val_env_manager: group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + temperature: 0.2 + num_return_sequences: 1 # Here, you can override variables defined in the imported envs. 
max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 diff --git a/examples/qwen2.5-0.5B-agentic/agentic_sokoban_rollout_mock_dump.yaml b/examples/qwen2.5-0.5B-agentic/agentic_sokoban_rollout_mock_dump.yaml new file mode 100644 index 000000000..b59749301 --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agentic_sokoban_rollout_mock_dump.yaml @@ -0,0 +1,153 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "sokoban_precision_test_dump" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + NCCL_ALGO: Ring + NVTE_ALLOW_NONDETERMINISTIC_ALGO: '0' + CUBLAS_WORKSPACE_CONFIG: ':4096:8' + DETERMINISTIC_MODE: '1' + +track_with: tensorboard +tracker_kwargs: + log_dir: ./output/tensorboard + +checkpoint_config: + type: file_system + output_dir: ./output/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 50 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 128 +val_batch_size: 128 +sequence_length: 8192 + +# Rollout Mock Configuration - DUMP MODE +rollout_mock: + enable: true + mode: dump + dump_dir: ./output/rollout_dumps/baseline_v1 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 32 + # warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(2,4)) + infer_batch_size: 2 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(2,4)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(2,4)) + infer_batch_size: 2 + +reward_normalization: + grouping: traj_group_id + method: mean_std + +train_env_manager: + format_penalty: -0.15 + max_env_num_per_worker: 16 + num_env_groups: 16 + group_size: 8 + tags: [SimpleSokoban] + num_groups_partition: [16] + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 128 + group_size: 1 + tags: [SimpleSokoban] + num_groups_partition: [128] + +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} 
+ FrozenLakeThink: + ${custom_env.FrozenLakeThink} diff --git a/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_dynamic_batching.yaml b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_dynamic_batching.yaml new file mode 100644 index 000000000..02d28f8c0 --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_dynamic_batching.yaml @@ -0,0 +1,175 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban + + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 1024 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + system_envs: + NVTE_FLASH_ATTN: '1' + NVTE_FUSED_ATTN: '0' + NVTE_UNFUSED_ATTN: '0' + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_train: true + max_tokens_per_microbatch_in_train: 8192 + sequence_length_round_in_train: 128 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + use_dynamic_batching_in_infer: true + max_tokens_per_microbatch_in_infer: 16384 + sequence_length_round_in_infer: 128 + +reward_normalization: + grouping: traj_group_id # 
可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + format_penalty: -0.15 # sokoban env penalty_for_step=-0.1 + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [SimpleSokoban] + num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 1024 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} diff --git a/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml new file mode 100644 index 000000000..5dd3fe377 --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_lora.yaml @@ -0,0 +1,169 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban + + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 1024 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + lora_target: all-linear + lora_rank: 32 + lora_alpha: 32 + training_args: + learning_rate: 2.0e-5 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj + lora_rank: 32 + lora_alpha: 32 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reward_normalization: + grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + format_penalty: -0.15 # sokoban env penalty_for_step=-0.1 + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [SimpleSokoban] + num_groups_partition: [128] # If not set, all env names divide nums equally. 
Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 1024 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} diff --git a/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_native.yaml b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_native.yaml new file mode 100644 index 000000000..3483fac2b --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_native.yaml @@ -0,0 +1,175 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 102400 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 256 +sequence_length: 8192 +async_generation_ratio: 1 + +advantage_clip: 20 +ppo_epochs: 1 +adv_estimator: "step_reinforce" +batch_adjust_mode: "copy" +step_reward_gamma: 1.0 + +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: false +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +reward_normalization: + grouping: traj_group_id +# method: mean + norm_mean_type: group + norm_std_type: batch + + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 128 + warmup_steps: 20 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + bias_activation_fusion: true + apply_rope_fusion: true +# recompute_granularity: full + device_mapping: list(range(0,16)) + infer_batch_size: 1 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${max_tokens_per_step} # single-turn response 
length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 +# load_format: auto + device_mapping: list(range(16,24)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,16)) + infer_batch_size: 2 + +train_env_manager: + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [SokobanNativeEnv] + num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 256 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SokobanNativeEnv] + num_groups_partition: [256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs (e.g. max_tokens_per_step: 128 in custom_env.SimpleSokoban) +max_tokens_per_step: 128 +max_actions_per_traj: 10 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + SokobanNativeEnv: + env_type: "sokoban_native_env" + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + max_steps: ${max_actions_per_traj} + format_penalty: -0.15 diff --git a/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_sandbox.yaml b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_sandbox.yaml new file mode 100644 index 000000000..df2feb840 --- /dev/null +++ b/examples/qwen2.5-0.5B-agentic/agentic_val_sokoban_sandbox.yaml @@ -0,0 +1,176 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "agentic_pipeline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render +system_envs: + USE_MODELSCOPE: '1' + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_sokoban + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 1024 +val_batch_size: 1024 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen2.5-0.5B-Instruct +reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct + +actor_train: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: +# strategy_name: deepspeed_train +# strategy_config: ${deepspeed_zero3} + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + load_format: auto + device_mapping: list(range(0,8)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reward_normalization: + grouping: traj_group_id # group_by for computing reward/adv; options: tags(env_type) / traj_group_id(group) / batch(rollout_batch) ... + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + format_penalty: -0.15 # sokoban env penalty_for_step=-0.1 + max_env_num_per_worker: 16 + num_env_groups: 128 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [SokobanSandbox] + num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 1024 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. 
Under the same group, the env config and env seed (prompt) are equal in each generation + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} + SokobanSandbox: + env_type: sokoban_sandbox + max_steps: ${max_actions_per_traj} + max_tokens_per_step: ${max_tokens_per_step} + env_manager_cls: ${env_manager_cls} + use_thread_lock: true + agent_system_template: "You are an expert Sokoban-playing agent. Your only task is to output the next action like Right. Do not output any other text, reasoning, or explanations." + agent_template: | + Current State: + {observation} + Turn {turn_idx}: + {suffix} + Respond with one action from the available list. + Format: ACTION + Choose your action. \ No newline at end of file diff --git a/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd.sh b/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd.sh deleted file mode 100644 index 54d095440..000000000 --- a/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=1 -CONFIG_FILE="agent_val_frozen_lake_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_agentic_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd_async.sh b/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd_async.sh deleted file mode 100644 index aa06c2054..000000000 --- a/examples/qwen2.5-0.5B-agentic/submit_pipeline_amd_async.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=1 -CONFIG_FILE="agent_val_frozen_lake_async_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_agentic_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo 
"CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-7B-agentic_megatron/run_agentic_pipeline_webshop.sh b/examples/qwen2.5-7B-agentic_megatron/run_agentic_pipeline_webshop.sh old mode 100644 new mode 100755 diff --git a/examples/qwen2.5-7B-rlvr-offpolicy/rlvr_config.yaml b/examples/qwen2.5-7B-rlvr-offpolicy/rlvr_config.yaml index ba8d6bde9..2f9e023f2 100644 --- a/examples/qwen2.5-7B-rlvr-offpolicy/rlvr_config.yaml +++ b/examples/qwen2.5-7B-rlvr-offpolicy/rlvr_config.yaml @@ -15,9 +15,11 @@ checkpoint_config: type: file_system output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + + # track_with: wandb # tracker_kwargs: -# api_key: xxx +# api_key: xxxx # project: roll_examples # name: ${exp_name} # notes: roll_examples diff --git a/examples/qwen2.5-7B-rlvr-offpolicy/run_rlvr_pipeline.sh b/examples/qwen2.5-7B-rlvr-offpolicy/run_rlvr_pipeline.sh old mode 100644 new mode 100755 diff --git a/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_amd_async.yaml b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_amd_async.yaml index 579245ad3..370cbff2f 100644 --- a/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_amd_async.yaml +++ b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_amd_async.yaml @@ -14,8 +14,6 @@ checkpoint_config: type: file_system output_dir: /data/cpfs_0/rl_examples/lzc/models/${exp_name} -track_with: ml_tracker - num_gpus_per_node: 8 max_steps: 1000 diff --git a/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_lora.yaml b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_lora.yaml new file mode 100644 index 000000000..e1bef22a8 --- /dev/null +++ b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_lora.yaml @@ -0,0 +1,265 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "qwen2.5-7B-rlvr-lora-config" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll_examples +# notes: roll_examples +# tags: +# - rlvr +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/rl_examples/llm/tensorboard/roll_exp/rlvr + +num_gpus_per_node: 8 + +max_steps: 500 +save_steps: 500 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + + +rollout_batch_size: 128 # prompt +prompt_length: 2048 +response_length: 4096 + +num_return_sequences_in_group: 8 +ppo_epochs: 1 +adv_estimator: "reinforce" + +# clip +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 2.0 +dual_clip_loss: true + +# normalize +norm_mean_type: ~ +norm_std_type: ~ + +# data mask +max_len_mask: true +difficulty_mask: true +difficulty_low_threshold: 0.1 +difficulty_high_threshold: 0.95 +error_max_len_clip: false + +# data weight +difficulty_loss_weight: false +length_loss_weight: false + +# reward +add_token_level_kl: false + +# advantage +whiten_advantages: true + +# dynamic sampling scheduler +# use_additional_prompts: true +# max_running_requests: 256 +# is_num_return_sequences_expand: false +global_template: qwen2_5 + +pretrain: Qwen/Qwen2.5-7B +reward_pretrain: Qwen/Qwen2.5-7B + +validation: + data_args: + template: qwen2_5 + file_name: + - data/math_benchmarks.jsonl + generating_args: + max_new_tokens: ${response_length} + top_p: 0.6 + top_k: 50 + num_beams: 1 + temperature: 0.6 + num_return_sequences: 1 + + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + lora_target: all-linear + lora_rank: 64 + lora_alpha: 64 + training_args: + learning_rate: 1.0e-5 + weight_decay: 0 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 32 + warmup_steps: 20 + num_train_epochs: 50 + data_args: + template: qwen2_5 + file_name: + - data/code_KodCode_data.jsonl + - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl + - data/math_deepmath_deal.jsonl + - data/general_ifeval_train_deal.jsonl + - data/general_CrossThink-QA_deal.jsonl + domain_interleave_probs: + math_rule: 0.4 + code_sandbox: 0.3 + llm_judge: 0.1 + crossthinkqa: 0.1 + ifeval: 0.1 + dataset_dir: data + messages: messages + interleave_probs: "1.0" + preprocessing_num_workers: 16 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 4 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj + lora_rank: 64 + lora_alpha: 64 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 8192 + load_format: auto + device_mapping: list(range(0,6)) + infer_batch_size: 2 + +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + 
strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + device_mapping: list(range(0,8)) + infer_batch_size: 8 + +rewards: + crossthinkqa: + worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker + reward_type: soft + response_length_penalty_coef: 0.0 + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [crossthinkqa] + world_size: 8 + infer_batch_size: 4 + ifeval: + worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker + reward_type: soft + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [ifeval] + world_size: 8 + infer_batch_size: 4 + math_rule: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [deepmath_103k, aime] + world_size: 8 + infer_batch_size: 1 +# dynamic filter config +# query_filter_config: +# type: mean_filter +# filter_args: +# threshold_up: 0.9 +# threshold_down: 0.1 + code_sandbox: + use_local: true + worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker + tag_included: [KodCode] + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + world_size: 8 + infer_batch_size: 1 +# query_filter_config: +# type: std_filter +# filter_args: +# std_threshold: 0 + llm_judge: + # NOTE: llm as judge also requires GPUs and cannot share GPUs with actor_infer + worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker + judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt + judge_model_type: inference + tag_included: [RLVR] + model_args: + model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: trl + generating_args: + max_new_tokens: 100 + top_p: 0.8 + top_k: 50 + num_beams: 1 + temperature: 0.8 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + # strategy_name: hf_infer + # strategy_config: null + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 10000 + load_format: auto + device_mapping: list(range(6,8)) + infer_batch_size: 4 \ No newline at end of file diff --git a/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_sequence_packing.yaml b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_sequence_packing.yaml new file mode 100644 index 000000000..4693c2dd5 --- /dev/null +++ b/examples/qwen2.5-7B-rlvr_megatron/rlvr_config_sequence_packing.yaml @@ -0,0 +1,273 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "qwen2.5-7B-rlvr-config" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll_examples +# notes: roll_examples +# tags: +# - rlvr +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/rl_examples/llm/tensorboard/roll_exp/rlvr + +num_gpus_per_node: 8 + +max_steps: 500 +save_steps: 100 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + + +rollout_batch_size: 64 # prompt +prompt_length: 2048 +response_length: 4096 + +num_return_sequences_in_group: 8 +ppo_epochs: 1 +adv_estimator: "reinforce" + +# clip +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 2.0 +dual_clip_loss: true + +# normalize +norm_mean_type: ~ +norm_std_type: ~ + +# data mask +max_len_mask: true +difficulty_mask: true +difficulty_low_threshold: 0.1 +difficulty_high_threshold: 0.95 +error_max_len_clip: false + +# data weight +difficulty_loss_weight: false +length_loss_weight: false + +# reward +add_token_level_kl: false + +# advantage +whiten_advantages: true + +# dynamic sampling scheduler +# use_additional_prompts: true +# max_running_requests: 256 +# is_num_return_sequences_expand: false +global_template: qwen2_5 + +pretrain: Qwen/Qwen2.5-7B +reward_pretrain: Qwen/Qwen2.5-7B + +validation: + data_args: + template: qwen2_5 + file_name: + - data/math_benchmarks.jsonl + generating_args: + max_new_tokens: ${response_length} + top_p: 0.6 + top_k: 50 + num_beams: 1 + temperature: 0.6 + num_return_sequences: 1 + + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 64 + warmup_steps: 20 + num_train_epochs: 50 + data_args: + template: qwen2_5 + file_name: + - data/code_KodCode_data.jsonl + - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl + - data/math_deepmath_deal.jsonl + - data/general_ifeval_train_deal.jsonl + - data/general_CrossThink-QA_deal.jsonl + domain_interleave_probs: + math_rule: 0.4 + code_sandbox: 0.3 + llm_judge: 0.1 + crossthinkqa: 0.1 + ifeval: 0.1 + dataset_dir: data + messages: messages + interleave_probs: "1.0" + preprocessing_num_workers: 16 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + use_sequence_packing: True + sequence_packing_args: + algorithm: load_balance + max_packed_sequence_length_train: 8192 + max_packed_sequence_length_forward: 8192 + min_num_micro_batches_train: 1 + min_num_micro_batches_forward: 1 + device_mapping: list(range(0,8)) + infer_batch_size: 4 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 8000 + load_format: auto + device_mapping: list(range(0,6)) + infer_batch_size: 1 + +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + 
template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_sequence_packing: True + sequence_packing_args: + algorithm: load_balance + max_packed_sequence_length_train: 8192 + max_packed_sequence_length_forward: 8192 + min_num_micro_batches_train: 1 + min_num_micro_batches_forward: 1 + device_mapping: list(range(0,8)) + infer_batch_size: 8 + +rewards: + crossthinkqa: + worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker + reward_type: soft + response_length_penalty_coef: 0.0 + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [crossthinkqa] + world_size: 8 + infer_batch_size: 4 + ifeval: + worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker + reward_type: soft + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [ifeval] + world_size: 8 + infer_batch_size: 4 + math_rule: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [deepmath_103k, aime] + world_size: 8 + infer_batch_size: 1 +# dynamic filter config +# query_filter_config: +# type: mean_filter +# filter_args: +# threshold_up: 0.9 +# threshold_down: 0.1 + code_sandbox: + use_local: true + worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker + tag_included: [KodCode] + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + world_size: 8 + infer_batch_size: 1 +# query_filter_config: +# type: std_filter +# filter_args: +# std_threshold: 0 + llm_judge: + # NOTE: llm as judge also requires GPUs and cannot share GPUs with actor_infer + worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker + judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt + judge_model_type: inference + tag_included: [RLVR] + model_args: + model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: trl + generating_args: + max_new_tokens: 100 + top_p: 0.8 + top_k: 50 + num_beams: 1 + temperature: 0.8 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + # strategy_name: hf_infer + # strategy_config: null + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 8000 + load_format: auto + device_mapping: list(range(6,8)) + infer_batch_size: 4 \ No newline at end of file diff --git a/examples/qwen2.5-7B-rlvr_megatron/rlvr_rollout_mock_dump.yaml b/examples/qwen2.5-7B-rlvr_megatron/rlvr_rollout_mock_dump.yaml new file mode 100644 index 000000000..045829326 --- /dev/null +++ b/examples/qwen2.5-7B-rlvr_megatron/rlvr_rollout_mock_dump.yaml @@ -0,0 +1,166 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "rlvr_precision_test_dump" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: ./output/models/${exp_name} + +track_with: tensorboard +tracker_kwargs: + log_dir: ./output/tensorboard + +num_gpus_per_node: 8 + +max_steps: 20 +save_steps: 10000 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +# Rollout Mock Configuration - DUMP MODE +rollout_mock: + enable: true + mode: dump + dump_dir: ./output/rollout_dumps/rlvr_baseline_v1 + +rollout_batch_size: 8 # prompt +prompt_length: 2048 +response_length: 4096 + +num_return_sequences_in_group: 8 +ppo_epochs: 1 +adv_estimator: "reinforce" + +# clip +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 2.0 +dual_clip_loss: true + +# normalize +norm_mean_type: ~ +norm_std_type: ~ + +# data mask +max_len_mask: true +difficulty_mask: true +difficulty_low_threshold: 0.1 +difficulty_high_threshold: 0.95 +error_max_len_clip: false + +# data weight +difficulty_loss_weight: false +length_loss_weight: false + +# reward +add_token_level_kl: false + +# advantage +whiten_advantages: true + +pretrain: Qwen/Qwen2.5-7B +reward_pretrain: Qwen/Qwen2.5-7B + +validation: + data_args: + template: qwen2_5 + file_name: + - data/math_benchmarks.jsonl + generating_args: + max_new_tokens: ${response_length} + top_p: 0.6 + top_k: 50 + num_beams: 1 + temperature: 0.6 + num_return_sequences: 1 + + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 8 + num_train_epochs: 50 + data_args: + template: qwen2_5 + file_name: + - data/math_deepmath_deal.jsonl + domain_interleave_probs: + math_rule: 1 + dataset_dir: data + messages: messages + interleave_probs: "1.0" + preprocessing_num_workers: 16 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + use_distributed_optimizer: true + recompute_granularity: full + device_mapping: list(range(0,8)) + infer_batch_size: 1 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 8000 + device_mapping: list(range(0,8)) + infer_batch_size: 1 + +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + device_mapping: list(range(0,8)) + infer_batch_size: 1 + +rewards: + math_rule: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [deepmath_103k, aime] + world_size: 8 + infer_batch_size: 1 diff --git a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd.sh b/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd.sh deleted file mode 100644 index fccb1ab1c..000000000 --- 
a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=2 -CONFIG_FILE="rlvr_config_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_async.sh b/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_async.sh deleted file mode 100644 index 484218310..000000000 --- a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_async.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=3 -CONFIG_FILE="rlvr_config_amd_async.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_zero3_lora.sh b/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_zero3_lora.sh deleted file mode 100644 index 25016bfa5..000000000 --- a/examples/qwen2.5-7B-rlvr_megatron/submit_pipeline_amd_zero3_lora.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=2 -CONFIG_FILE="rlvr_lora_zero3_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - 
-CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - -QUEUE="nebula_test2_308x_gpu_hang" - - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch260_rocm700rc4 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch260_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-vl-7B-distill/distill_vl_zero3.yaml b/examples/qwen2.5-vl-7B-distill/distill_vl_zero3.yaml index ac91a5687..b81355bbc 100644 --- a/examples/qwen2.5-vl-7B-distill/distill_vl_zero3.yaml +++ b/examples/qwen2.5-vl-7B-distill/distill_vl_zero3.yaml @@ -18,6 +18,7 @@ checkpoint_config: type: file_system output_dir: /data/cpfs_0/rl_examples/models/${exp_name} + save_steps: 100 logging_steps: 1 resume_from_checkpoint: false diff --git a/examples/qwen2.5-vl-7B-math/rlvr_math_lora.yaml b/examples/qwen2.5-vl-7B-math/rlvr_math_lora.yaml index cfbbcccf6..c8ab855e8 100644 --- a/examples/qwen2.5-vl-7B-math/rlvr_math_lora.yaml +++ b/examples/qwen2.5-vl-7B-math/rlvr_math_lora.yaml @@ -102,6 +102,8 @@ actor_infer: strategy_config: gpu_memory_utilization: 0.8 block_size: 16 + # mm preprocessor cache mismatch error occurred in vllm084 + enable_prefix_caching: false num_gpus_per_worker: 1 device_mapping: list(range(0,16)) infer_batch_size: 32 diff --git a/examples/qwen2.5-vl-7B-math/rlvr_math_megatron.yaml b/examples/qwen2.5-vl-7B-math/rlvr_math_megatron.yaml index fcd11abda..0d3c45a06 100644 --- a/examples/qwen2.5-vl-7B-math/rlvr_math_megatron.yaml +++ b/examples/qwen2.5-vl-7B-math/rlvr_math_megatron.yaml @@ -112,6 +112,8 @@ actor_infer: gpu_memory_utilization: 0.9 block_size: 16 disable_mm_preprocessor_cache: true # RAM leak: https://github.com/vllm-project/vllm/issues/15085 + # mm preprocessor cache mismatch error occurred in vllm084 + enable_prefix_caching: false num_gpus_per_worker: 1 device_mapping: list(range(0,16)) infer_batch_size: 32 diff --git a/examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml b/examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml index c520df3a4..2499ab156 100644 --- a/examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml +++ b/examples/qwen2.5-vl-7B-math/rlvr_math_zero3.yaml @@ -90,6 +90,8 @@ actor_infer: strategy_config: gpu_memory_utilization: 0.9 block_size: 16 + # mm preprocessor cache mismatch error occurred in vllm084 + enable_prefix_caching: false num_gpus_per_worker: 1 device_mapping: list(range(0,16)) infer_batch_size: 32 diff --git a/examples/qwen2.5-vl-7B-math/submit_pipeline_amd.sh b/examples/qwen2.5-vl-7B-math/submit_pipeline_amd.sh deleted file mode 100644 index 2f5102eb3..000000000 --- a/examples/qwen2.5-vl-7B-math/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=2 -CONFIG_FILE="rlvr_math_megatron_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_vlmath_pipeline.py"
- -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0,MIOPEN_DEBUG_FORCE_IMMED_MODE_FALLBACK=1" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen2.5-vl-7B-rlvr/rlvr_async.yaml b/examples/qwen2.5-vl-7B-rlvr/rlvr_async.yaml new file mode 100644 index 000000000..8bf5e1bc7 --- /dev/null +++ b/examples/qwen2.5-vl-7B-rlvr/rlvr_async.yaml @@ -0,0 +1,151 @@ +defaults: + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ +hydra: + run: + dir: . + output_subdir: null +exp_name: "qwen2_5_vl_7B_rlvr" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/yuzhao/models +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard +save_steps: 40 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false +rollout_batch_size: 256 +num_return_sequences_in_group: 8 +is_num_return_sequences_expand: true +prompt_length: 2048 +response_length: 4096 +async_generation_ratio: 1 +ppo_epochs: 1 +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 10.0 +whiten_advantages: false +init_kl_coef: 0.0 +adv_estimator: "grpo" +use_kl_loss: true +kl_loss_coef: 1.0e-2 +pretrain: Qwen/Qwen2.5-VL-7B-Instruct +validation: + data_args: + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_math_megabench_237.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_detection_coco_test_multi_2000.parquet + dataset_dir: ./ + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + eval_steps: ${eval_steps} +actor_train: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 1.0e-6 + weight_decay: 1.0e-2 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 256 + warmup_steps: 0 + num_train_epochs: 50 + data_args: + # use One-RL-to-See-Them-All/Orsta-Data-47k as train dataset + # download from https://huggingface.co/datasets/One-RL-to-See-Them-All/Orsta-Data-47k + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_detection_v3det_4000.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_math_mmmath_3539.parquet + 
domain_interleave_probs: + math: 0.5 + cv_detection: 0.5 + dataset_dir: ./ + messages: prompt + preprocessing_num_workers: 32 + strategy_args: + strategy_name: megatron_train + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 4 + context_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + overlap_grad_reduce: true + use_distributed_optimizer: true + bf16: true + device_mapping: list(range(0,16)) + infer_batch_size: 8 +actor_infer: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + # mm preprocessor cache mismatch error occurred in vllm084 + disable_mm_preprocessor_cache: true + enable_prefix_caching: false + num_gpus_per_worker: 1 + device_mapping: list(range(16,24)) + infer_batch_size: 32 +reference: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + strategy_args: + strategy_name: megatron_infer + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 2 + context_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + bf16: true + device_mapping: list(range(0,16)) + infer_batch_size: 8 +rewards: + math: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${pretrain} + # data source whose ability is math in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [mm_math, megabench_math] + world_size: 8 + infer_batch_size: 1 + cv_detection: + worker_cls: roll.pipeline.rlvr.rewards.detection_reward_worker.DetectionRewardWorker + model_args: + model_name_or_path: ${pretrain} + # data source whose ability is cv_detection in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [v3det_train, object365_train, coco_val_multi_test] + world_size: 8 + infer_batch_size: 1 diff --git a/examples/qwen2.5-vl-7B-rlvr/rlvr_megatron.yaml b/examples/qwen2.5-vl-7B-rlvr/rlvr_megatron.yaml index 54807d13b..804ee3bd2 100644 --- a/examples/qwen2.5-vl-7B-rlvr/rlvr_megatron.yaml +++ b/examples/qwen2.5-vl-7B-rlvr/rlvr_megatron.yaml @@ -118,6 +118,8 @@ actor_infer: strategy_config: gpu_memory_utilization: 0.8 block_size: 16 + # mm preprocessor cache mismatch error occurred in vllm084 + enable_prefix_caching: false num_gpus_per_worker: 1 device_mapping: list(range(0,32)) infer_batch_size: 32 diff --git a/examples/qwen3-235BA22B-rlvr_megatron/submit_pipeline_amd.sh b/examples/qwen3-235BA22B-rlvr_megatron/submit_pipeline_amd.sh deleted file mode 100644 index bf10ec75a..000000000 --- a/examples/qwen3-235BA22B-rlvr_megatron/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=32 -CONFIG_FILE="rlvr_config_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0,NCCL_DEBUG=INFO" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo 
"CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen3-30BA3B-agentic_fsdp2/agentic_val_sokoban_30a3.yaml b/examples/qwen3-30BA3B-agentic_fsdp2/agentic_val_sokoban_30a3.yaml new file mode 100644 index 000000000..ce30a9a83 --- /dev/null +++ b/examples/qwen3-30BA3B-agentic_fsdp2/agentic_val_sokoban_30a3.yaml @@ -0,0 +1,184 @@ +defaults: + - ../config/traj_envs@_here_ + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "agentic_pipeline_30a3_2k" +seed: 42 +rpc_timeout: 72000 # Timeout for Ray RPC calls in seconds (20 hours) +logging_dir: ./output/logs +output_dir: ./output +render_save_dir: ./output/render + +profiler_memory: false +system_envs: + USE_MODELSCOPE: '1' + +# profiler_output_dir: /data/oss_bucket_0/pumpkin/exps/profile/${exp_name} + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll-agentic +# name: ${exp_name}_sokoban +# notes: "agentic_pipeline" +# tags: +# - agentic +# - roll +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: ./tf_log + + +checkpoint_config: + type: file_system + +num_gpus_per_node: 8 + +max_steps: 1024 +save_steps: 5 +logging_steps: 1 +eval_steps: 10 + +rollout_batch_size: 128 +val_batch_size: 128 +sequence_length: 8192 + +advantage_clip: 0.2 +ppo_epochs: 1 +adv_estimator: "grpo" +#pg_clip: 0.1 +#dual_clip_loss: True +init_kl_coef: 0.0 +whiten_advantages: true +entropy_loss_coef: 0 +max_grad_norm: 1.0 + +pretrain: Qwen/Qwen3-30B-A3B +reward_pretrain: Qwen/Qwen3-30B-A3B + +actor_train: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + ulysses_size: 2 + training_args: + learning_rate: 1.0e-6 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 16 + warmup_steps: 10 + lr_scheduler_type: cosine + data_args: + template: qwen2_5 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + fsdp_size: 16 + param_dtype: bf16 + reduce_dtype: bf16 + offload_policy: true + apply_expert_patch: true + apply_tiled_mlp: true + tiled_num_shards: 4 + reshard_after_forward: true + wrap_policy: + wrap_embeddings: true + wrap_lm_output: true + moe_experts: + - Qwen3MoeMLP + transformer_layer_cls_to_wrap: + - Qwen3MoeAttention + - Qwen3MoeSparseMoeBlock + use_remove_padding: true + device_mapping: list(range(0,32)) + infer_batch_size: 1 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: 128 # single-turn response length + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + strategy_args: + strategy_name: vllm + strategy_config: + 
gpu_memory_utilization: 0.7 + block_size: 16 + load_format: auto + tensor_parallel_size: 4 + device_mapping: list(range(0,32)) + +reference: + model_args: + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + ulysses_size: 2 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: fsdp2_infer + strategy_config: + fsdp_size: 16 + param_dtype: bf16 + reduce_dtype: bf16 + apply_tiled_mlp: true + tiled_num_shards: 8 + reshard_after_forward: true + offload_policy: true + device_mapping: list(range(0,32)) + infer_batch_size: 1 + +reward_normalization: + grouping: traj_group_id # group_by for computing reward/adv; options: tags(env_type) / traj_group_id(group) / batch(rollout_batch) ... + method: mean_std # asym_clip / identity / mean_std + +train_env_manager: + format_penalty: -0.15 # sokoban env penalty_for_step=-0.1 + max_env_num_per_worker: 16 + num_env_groups: 16 + # under the same group, the env config and env seed are ensured to be equal + group_size: 8 + tags: [SimpleSokoban] + num_groups_partition: [16] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + +val_env_manager: + max_env_num_per_worker: 32 + num_env_groups: 128 + group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output + tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake] + num_groups_partition: [32, 32, 32, 32] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation + + +# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64 +max_tokens_per_step: 64 + +custom_envs: + SimpleSokoban: + ${custom_env.SimpleSokoban} + LargerSokoban: + ${custom_env.LargerSokoban} + SokobanDifferentGridVocab: + ${custom_env.SokobanDifferentGridVocab} + FrozenLake: + ${custom_env.FrozenLake} + FrozenLakeThink: + ${custom_env.FrozenLakeThink} diff --git a/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_amd.yaml b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_amd.yaml index 4af2f0893..60d4b9171 100644 --- a/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_amd.yaml +++ b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_amd.yaml @@ -133,7 +133,7 @@ actor_train: use_distributed_optimizer: true sequence_parallel: true moe_token_dispatcher_type: "alltoall" - moe_grouped_gemm: true + moe_grouped_gemm: false # If you encounter timeout issues, you can disable this option. moe_layer_recompute: true device_mapping: list(range(0,32)) infer_batch_size: 2 @@ -176,7 +176,7 @@ reference: pipeline_model_parallel_size: 1 expert_model_parallel_size: 4 moe_token_dispatcher_type: "alltoall" - moe_grouped_gemm: true + moe_grouped_gemm: false # If you encounter timeout issues, you can disable this option. device_mapping: list(range(0,32)) infer_batch_size: 2 @@ -257,7 +257,7 @@ rewards: # strategy_config: null strategy_name: vllm strategy_config: - gpu_memory_utilization: 0.8 + gpu_memory_utilization: 0.5 block_size: 16 max_model_len: 8000 load_format: auto diff --git a/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_lora.yaml b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_lora.yaml new file mode 100644 index 000000000..ea7a556b9 --- /dev/null +++ b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_lora.yaml @@ -0,0 +1,271 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "qwen3-30BA3B-rlvr-config-lora" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output +system_envs: + USE_MODELSCOPE: '1' + +checkpoint_config: + type: file_system + output_dir: ./rl_examples/models/${exp_name} + +#track_with: wandb +#tracker_kwargs: +# api_key: +# project: roll_examples +# notes: roll_examples +# tags: +# - rlvr +# - baseline + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/shidie/tensorboard/${exp_name} + +model_download_type: MODELSCOPE +num_gpus_per_node: 8 + +max_steps: 500 +save_steps: 100 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + + +rollout_batch_size: 64 # prompt +prompt_length: 2048 +response_length: 4096 + +num_return_sequences_in_group: 8 +ppo_epochs: 1 +adv_estimator: "reinforce" + +# clip +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 2.0 +dual_clip_loss: true + +# normalize +norm_mean_type: ~ +norm_std_type: ~ + +# data mask +max_len_mask: true +difficulty_mask: true +difficulty_low_threshold: 0.1 +difficulty_high_threshold: 0.95 +error_max_len_clip: false + +# data weight +difficulty_loss_weight: false +length_loss_weight: false + +# reward +add_token_level_kl: false + +# advantage +whiten_advantages: true + +# dynamic sampling scheduler +# use_additional_prompts: true +# max_running_requests: 256 +# is_num_return_sequences_expand: false + +pretrain: Qwen/Qwen3-30B-A3B-Base +reward_pretrain: Qwen/Qwen3-30B-A3B-Base + +validation: + data_args: + template: qwen2_5 + file_name: + - data/math_benchmarks.jsonl + generating_args: + max_new_tokens: ${response_length} + top_p: 0.6 + top_k: 50 + num_beams: 1 + temperature: 0.6 + num_return_sequences: 1 + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + lora_target: all-linear + lora_rank: 64 + lora_alpha: 64 + training_args: + learning_rate: 2.0e-5 + weight_decay: 0 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 16 + warmup_steps: 20 + num_train_epochs: 50 + data_args: + template: qwen2_5 + file_name: + - data/code_KodCode_data.jsonl + - data/llm_judge_Multi-subject-RLVR_deal_new.jsonl + - data/math_deepmath_deal.jsonl + - data/general_ifeval_train_deal.jsonl + - data/general_CrossThink-QA_deal.jsonl + domain_interleave_probs: + math_rule: 0.4 + code_sandbox: 0.3 + # llm_judge: 0.1 + crossthinkqa: 0.1 + ifeval: 0.1 + dataset_dir: data + messages: messages + interleave_probs: "1.0" + preprocessing_num_workers: 16 + strategy_args: + strategy_name: megatron_train + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 2 + expert_model_parallel_size: 8 + context_parallel_size: 1 + use_distributed_optimizer: true + sequence_parallel: true + moe_token_dispatcher_type: "alltoall" + moe_grouped_gemm: true + recompute_granularity: full + device_mapping: list(range(0,16)) + infer_batch_size: 4 + +actor_infer: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + lora_target: q_proj,k_proj,v_proj,o_proj,gate_proj,up_proj,down_proj + lora_rank: 64 + lora_alpha: 64 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.7 + load_format: auto + tensor_parallel_size: 2 + max_model_len: 8192 + num_gpus_per_worker: 2 + device_mapping: list(range(0,16)) # device share with llm reward + 
infer_batch_size: 1 + +reference: + model_args: + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + strategy_args: + strategy_name: megatron_infer + strategy_config: + tensor_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 8 + moe_token_dispatcher_type: "alltoall" + moe_grouped_gemm: true + device_mapping: list(range(0,16)) + infer_batch_size: 2 + +rewards: + crossthinkqa: + worker_cls: roll.pipeline.rlvr.rewards.crossthinkqa_rule_reward_worker.CrossThinkQARuleRewardWorker + reward_type: soft + response_length_penalty_coef: 0.0 + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [crossthinkqa] + world_size: 8 + infer_batch_size: 4 + ifeval: + worker_cls: roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker.GeneralRuleRewardWorker + reward_type: soft + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [ifeval] + world_size: 8 + infer_batch_size: 4 + math_rule: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + tag_included: [deepmath_103k, aime] + world_size: 8 + infer_batch_size: 1 +# dynamic filter config +# query_filter_config: +# type: mean_filter +# filter_args: +# threshold_up: 0.9 +# threshold_down: 0.1 + code_sandbox: + use_local: true + worker_cls: roll.pipeline.rlvr.rewards.code_sandbox_reward_worker.CodeSandboxRewardWorker + tag_included: [KodCode] + model_args: + model_name_or_path: ${reward_pretrain} + data_args: + template: qwen2_5 + world_size: 8 + infer_batch_size: 1 +# query_filter_config: +# type: std_filter +# filter_args: +# std_threshold: 0 + # llm_judge: + # # NOTE: llm as judge also requires GPUs and cannot share GPUs with actor_infer + # worker_cls: roll.pipeline.rlvr.rewards.llm_judge_reward_worker.LLMJudgeRewardWorker + # judge_prompt: Qwen2.5-7B-Instruct-RLVR-prompt + # judge_model_type: inference + # tag_included: [RLVR] + # model_args: + # model_name_or_path: virtuoussy/Qwen2.5-7B-Instruct-RLVR + # attn_implementation: fa2 + # disable_gradient_checkpointing: true + # dtype: bf16 + # model_type: trl + # generating_args: + # max_new_tokens: 100 + # top_p: 0.8 + # top_k: 50 + # num_beams: 1 + # temperature: 0.8 + # num_return_sequences: 1 + # data_args: + # template: qwen2_5 + # strategy_args: + # # strategy_name: hf_infer + # # strategy_config: null + # strategy_name: vllm + # strategy_config: + # gpu_memory_utilization: 0.7 + # block_size: 16 + # max_model_len: 10000 + # load_format: auto + # device_mapping: list(range(0,2))+list(range(14,16)) + # infer_batch_size: 4 \ No newline at end of file diff --git a/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_sglang.yaml b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_sglang.yaml index 7e4b577af..57749482e 100644 --- a/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_sglang.yaml +++ b/examples/qwen3-30BA3B-rlvr_megatron/rlvr_config_sglang.yaml @@ -83,7 +83,7 @@ validation: data_args: template: qwen2_5 file_name: - - data/math_benchmarks.jsonl + - data/aime24_25_deal.jsonl generating_args: top_p: 0.6 top_k: 50 diff --git a/examples/qwen3-30BA3B-rlvr_megatron/submit_pipeline_amd.sh b/examples/qwen3-30BA3B-rlvr_megatron/submit_pipeline_amd.sh deleted file mode 100644 index f2937e32e..000000000 --- a/examples/qwen3-30BA3B-rlvr_megatron/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source 
"examples/scripts/config.sh" - -WORKER_COUNT=4 -CONFIG_FILE="rlvr_config_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen3-next-80BA3B-rlvr_megatron/submit_pipeline_amd.sh b/examples/qwen3-next-80BA3B-rlvr_megatron/submit_pipeline_amd.sh deleted file mode 100644 index 8d10e0037..000000000 --- a/examples/qwen3-next-80BA3B-rlvr_megatron/submit_pipeline_amd.sh +++ /dev/null @@ -1,49 +0,0 @@ -#!/bin/bash -set +x -source "examples/scripts/config.sh" - -WORKER_COUNT=8 -CONFIG_FILE="rlvr_config_amd.yaml" -# 替换为mos uri -NEBULA_MODEL="" -ENTRY_FILE="examples/start_rlvr_pipeline.py" - -CONFIG_PATH=$(basename $(dirname $0)) -CONFIG_NAME="${CONFIG_FILE%.yaml}" -JOB_NAME="$CONFIG_PATH-$CONFIG_NAME" - - -QUEUE="nebula_test2_308x_gpu_hang" -# QUEUE="nebula_test_308x" -ENVS="NCCL_PF_UCM_TIMEOUT=600000,NCCL_SOCKET_IFNAME=bond0" -# ENVS="NCCL_PF_UCM_TIMEOUT=600000" - -echo "JOB_NAME: ${JOB_NAME}" -echo "WORKER_COUNT: ${WORKER_COUNT}" -echo "CONFIG_NAME: ${CONFIG_NAME}" -echo "CONFIG_PATH: ${CONFIG_PATH}" -echo "ENTRY_FILE: ${ENTRY_FILE}" - -args="--config_name ${CONFIG_NAME} --config_path ${CONFIG_PATH}" - -mdl_args="--queue=${QUEUE} \ - --entry=${ENTRY_FILE} \ - --worker_count=${WORKER_COUNT} \ - --file.cluster_file=examples/scripts/cluster.json \ - --job_name=${JOB_NAME} \ - --algo_name=pytorch280 \ - --requirements_file_name=nebula_patch/requirements/requirements_torch280_vllm_amd.txt \ - --oss_appendable=true \ - --_NEBULA_MODEL=${NEBULA_MODEL} \ - --nebula_model=${NEBULA_MODEL} \ - --env=${ENVS} \ - --force \ - " -if [ -n "${OPENLM_TOKEN}" ]; then - mdl_args="${mdl_args} --env=OPENLM_TOKEN=${OPENLM_TOKEN}" -fi - -echo ${args} -echo ${mdl_args} - -nebulactl run mdl --user_params="${args}" $mdl_args diff --git a/examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron.yaml b/examples/qwen3-omni/rlvr_megatron.yaml similarity index 82% rename from examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron.yaml rename to examples/qwen3-omni/rlvr_megatron.yaml index 4b6c12123..6ce249b30 100644 --- a/examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron.yaml +++ b/examples/qwen3-omni/rlvr_megatron.yaml @@ -9,19 +9,11 @@ hydra: dir: . 
output_subdir: null -exp_name: "qwen3_vl_4B_rlvr" +exp_name: "qwen3_omni_rlvr" seed: 42 logging_dir: ./output/logs output_dir: ./output -checkpoint_config: - type: file_system - output_dir: /data/cpfs_0/yuzhao/models - -track_with: tensorboard -tracker_kwargs: - log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard - save_steps: 20 logging_steps: 1 eval_steps: 10 @@ -43,7 +35,8 @@ adv_estimator: "grpo" use_kl_loss: true kl_loss_coef: 1.0e-2 -pretrain: Qwen/Qwen3-VL-4B-Thinking +pretrain: Qwen/Qwen3-Omni-30B-A3B-Thinking +# pretrain: Qwen/Qwen3-Omni-30B-A3B-Instruct validation: data_args: @@ -67,12 +60,12 @@ actor_train: disable_gradient_checkpointing: false dtype: bf16 model_type: ~ - freeze_module_prefix: vision_model + freeze_module_prefix: "vision_model,audio_model,talker,code2wav" training_args: learning_rate: 1.0e-6 weight_decay: 1.0e-2 per_device_train_batch_size: 2 - gradient_accumulation_steps: 64 + gradient_accumulation_steps: 32 warmup_steps: 0 num_train_epochs: 50 data_args: @@ -93,13 +86,16 @@ actor_train: sequence_parallel: true tensor_model_parallel_size: 1 context_parallel_size: 1 - expert_model_parallel_size: 1 + expert_model_parallel_size: 8 pipeline_model_parallel_size: 1 - overlap_grad_reduce: true use_distributed_optimizer: true + # overlap_grad_reduce: true # to be resolved + moe_token_dispatcher_type: alltoall + moe_grouped_gemm: true + recompute_granularity: full bf16: true - device_mapping: list(range(0,8)) - infer_batch_size: 2 + device_mapping: list(range(0,32)) + infer_batch_size: 1 actor_infer: model_args: @@ -117,11 +113,15 @@ actor_infer: strategy_args: strategy_name: vllm strategy_config: - gpu_memory_utilization: 0.8 + gpu_memory_utilization: 0.6 block_size: 16 - num_gpus_per_worker: 1 - device_mapping: list(range(0,8)) - infer_batch_size: 32 + max_model_len: 8192 + tensor_parallel_size: 4 + enforce_eager: true + load_format: dummy + num_gpus_per_worker: 4 + device_mapping: list(range(0,32)) + infer_batch_size: 2 reference: model_args: @@ -135,12 +135,12 @@ reference: strategy_config: sequence_parallel: true tensor_model_parallel_size: 1 - context_parallel_size: 1 + context_parallel_size: 2 pipeline_model_parallel_size: 1 - expert_model_parallel_size: 1 + expert_model_parallel_size: 8 bf16: true - device_mapping: list(range(0,8)) - infer_batch_size: 8 + device_mapping: list(range(0,32)) + infer_batch_size: 2 rewards: math: diff --git a/examples/qwen3-omni/run_rlvr_pipeline.sh b/examples/qwen3-omni/run_rlvr_pipeline.sh new file mode 100755 index 000000000..9a8f0ef42 --- /dev/null +++ b/examples/qwen3-omni/run_rlvr_pipeline.sh @@ -0,0 +1,5 @@ +#!/bin/bash +set +x + +CONFIG_PATH=$(basename $(dirname $0)) +python examples/start_rlvr_vl_pipeline.py --config_path $CONFIG_PATH --config_name rlvr_megatron diff --git a/examples/qwen3-vl-30BA3B-rlvr_megatron/rlvr_megatron_80GB.yaml b/examples/qwen3-vl-30BA3B-rlvr_megatron/rlvr_megatron_80GB.yaml new file mode 100644 index 000000000..749848410 --- /dev/null +++ b/examples/qwen3-vl-30BA3B-rlvr_megatron/rlvr_megatron_80GB.yaml @@ -0,0 +1,172 @@ +defaults: + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "qwen3_vl_moe_30BA3B_rlvr" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/yuzhao/models + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard + +save_steps: 20 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 256 +num_return_sequences_in_group: 8 +is_num_return_sequences_expand: true +prompt_length: 2048 +response_length: 4096 + +ppo_epochs: 1 +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 10.0 +whiten_advantages: false +init_kl_coef: 0.0 +adv_estimator: "grpo" +use_kl_loss: true +kl_loss_coef: 1.0e-2 + +pretrain: Qwen/Qwen3-VL-30B-A3B-Instruct + +validation: + data_args: + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_math_megabench_237.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_detection_coco_test_multi_2000.parquet + dataset_dir: ./ + preprocessing_num_workers: 32 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + eval_steps: ${eval_steps} + +actor_train: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + freeze_module_prefix: vision_model + training_args: + learning_rate: 1.0e-6 + weight_decay: 1.0e-2 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 64 + warmup_steps: 0 + num_train_epochs: 50 + data_args: + # use One-RL-to-See-Them-All/Orsta-Data-47k as train dataset + # download from https://huggingface.co/datasets/One-RL-to-See-Them-All/Orsta-Data-47k + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_detection_v3det_4000.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_math_mmmath_3539.parquet + domain_interleave_probs: + math: 0.5 + cv_detection: 0.5 + dataset_dir: ./ + messages: prompt + preprocessing_num_workers: 32 + strategy_args: + strategy_name: megatron_train + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 1 + context_parallel_size: 1 + expert_model_parallel_size: 8 + pipeline_model_parallel_size: 1 + use_distributed_optimizer: true + moe_token_dispatcher_type: alltoall + recompute_granularity: selective + recompute_modules: "moe,layernorm" + bias_activation_fusion: true + apply_rope_fusion: true + moe_grouped_gemm: true + moe_shared_expert_overlap: true + bf16: true + device_mapping: list(range(0,16)) + infer_batch_size: 1 + +actor_infer: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.75 + block_size: 16 + max_model_len: 8192 + tensor_parallel_size: 2 + enforce_eager: true + load_format: dummy + num_gpus_per_worker: 4 + device_mapping: list(range(0,16)) + infer_batch_size: 2 + +reference: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + strategy_args: + strategy_name: megatron_infer + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 1 + 
context_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 2 + bf16: true + device_mapping: list(range(0,16)) + infer_batch_size: 2 + +rewards: + math: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + model_name_or_path: ${pretrain} + # data source whose ability is math in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [mm_math, megabench_math] + world_size: 8 + infer_batch_size: 1 + cv_detection: + worker_cls: roll.pipeline.rlvr.rewards.detection_reward_worker.DetectionRewardWorker + model_args: + model_name_or_path: ${pretrain} + # data source whose ability is cv_detection in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [v3det_train, object365_train, coco_val_multi_test] + world_size: 8 + infer_batch_size: 1 diff --git a/examples/qwen3-vl-30BA3B-rlvr_megatron/run_rlvr_pipeline.sh b/examples/qwen3-vl-30BA3B-rlvr_megatron/run_rlvr_pipeline.sh new file mode 100755 index 000000000..fc40086a7 --- /dev/null +++ b/examples/qwen3-vl-30BA3B-rlvr_megatron/run_rlvr_pipeline.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set +x + +# fix cudnn for roll-image +# mv /usr/local/lib/python3.12/dist-packages/torch/lib/../../nvidia/cudnn /usr/local/lib/python3.12/dist-packages/torch/lib/../../nvidia/cudnn_bak + +CONFIG_PATH=$(basename $(dirname $0)) +python examples/start_rlvr_vl_pipeline.py --config_path $CONFIG_PATH --config_name rlvr_megatron_80GB diff --git a/examples/qwen3-vl-32B-rlvr_megatron/run_rlvr_pipeline.sh b/examples/qwen3-vl-32B-rlvr_megatron/run_rlvr_pipeline.sh old mode 100644 new mode 100755 diff --git a/examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron_80G.yaml b/examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron_80G.yaml new file mode 100644 index 000000000..0e1900e61 --- /dev/null +++ b/examples/qwen3-vl-4B-rlvr_megatron/rlvr_megatron_80G.yaml @@ -0,0 +1,163 @@ +defaults: + - ../config/deepspeed_zero@_here_ + - ../config/deepspeed_zero2@_here_ + - ../config/deepspeed_zero3@_here_ + - ../config/deepspeed_zero3_cpuoffload@_here_ + +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "qwen3_vl_4B_rlvr" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +checkpoint_config: + type: file_system + output_dir: /data/cpfs_0/yuzhao/models + +track_with: tensorboard +tracker_kwargs: + log_dir: /data/oss_bucket_0/yuzhao/llm/tensorboard + +save_steps: 20 +logging_steps: 1 +eval_steps: 10 +resume_from_checkpoint: false + +rollout_batch_size: 256 +num_return_sequences_in_group: 8 +is_num_return_sequences_expand: true +prompt_length: 2048 +response_length: 4096 + +ppo_epochs: 1 +value_clip: 0.5 +reward_clip: 10 +advantage_clip: 10.0 +whiten_advantages: false +init_kl_coef: 0.0 +adv_estimator: "grpo" +use_kl_loss: true +kl_loss_coef: 1.0e-2 + +pretrain: Qwen/Qwen3-VL-4B-Thinking + +validation: + data_args: + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_math_megabench_237.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/test/test_detection_coco_test_multi_2000.parquet + dataset_dir: ./ + preprocessing_num_workers: 32 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + eval_steps: ${eval_steps} + +actor_train: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + freeze_module_prefix: vision_model + training_args: + learning_rate: 1.0e-6 + weight_decay: 1.0e-2 + per_device_train_batch_size: 2 + gradient_accumulation_steps: 64 + warmup_steps: 0 + num_train_epochs: 50 + data_args: + # use One-RL-to-See-Them-All/Orsta-Data-47k as train dataset + # download from https://huggingface.co/datasets/One-RL-to-See-Them-All/Orsta-Data-47k + file_name: + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_detection_v3det_4000.parquet + - /data/oss_bucket_0/yuzhao/data/One-RL-to-See-Them-All/Orsta-Data-47k/train/train_math_mmmath_3539.parquet + domain_interleave_probs: + math: 0.5 + cv_detection: 0.5 + dataset_dir: ./ + messages: prompt + preprocessing_num_workers: 32 + strategy_args: + strategy_name: megatron_train + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 1 + context_parallel_size: 1 + expert_model_parallel_size: 1 + pipeline_model_parallel_size: 1 + overlap_grad_reduce: true + use_distributed_optimizer: true + recompute_granularity: full + bf16: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: ${num_return_sequences_in_group} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + num_gpus_per_worker: 1 + device_mapping: list(range(0,8)) + infer_batch_size: 32 + +reference: + model_args: + flash_attn: fa2 + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + model_type: ~ + strategy_args: + strategy_name: megatron_infer + strategy_config: + sequence_parallel: true + tensor_model_parallel_size: 1 + context_parallel_size: 1 + pipeline_model_parallel_size: 1 + expert_model_parallel_size: 1 + bf16: true + device_mapping: list(range(0,8)) + infer_batch_size: 4 + +rewards: + math: + worker_cls: roll.pipeline.rlvr.rewards.math_rule_reward_worker.MathRuleRewardWorker + model_args: + 
model_name_or_path: ${pretrain} + # data source whose ability is math in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [mm_math, megabench_math] + world_size: 8 + infer_batch_size: 1 + cv_detection: + worker_cls: roll.pipeline.rlvr.rewards.detection_reward_worker.DetectionRewardWorker + model_args: + model_name_or_path: ${pretrain} + # data source whose ability is cv_detection in One-RL-to-See-Them-All/Orsta-Data-47k + tag_included: [v3det_train, object365_train, coco_val_multi_test] + world_size: 8 + infer_batch_size: 1 diff --git a/examples/qwen3-vl-4B-rlvr_megatron/run_rlvr_pipeline.sh b/examples/qwen3-vl-4B-rlvr_megatron/run_rlvr_pipeline.sh old mode 100644 new mode 100755 index 9a8f0ef42..c7875c344 --- a/examples/qwen3-vl-4B-rlvr_megatron/run_rlvr_pipeline.sh +++ b/examples/qwen3-vl-4B-rlvr_megatron/run_rlvr_pipeline.sh @@ -1,5 +1,8 @@ #!/bin/bash set +x +# fix cudnn for roll-image +# mv /usr/local/lib/python3.12/dist-packages/torch/lib/../../nvidia/cudnn /usr/local/lib/python3.12/dist-packages/torch/lib/../../nvidia/cudnn_bak + CONFIG_PATH=$(basename $(dirname $0)) -python examples/start_rlvr_vl_pipeline.py --config_path $CONFIG_PATH --config_name rlvr_megatron +python examples/start_rlvr_vl_pipeline.py --config_path $CONFIG_PATH --config_name rlvr_megatron_80G diff --git a/examples/start_agentic_pipeline.py b/examples/start_agentic_pipeline.py index 29aa8ed2c..1b10c685f 100644 --- a/examples/start_agentic_pipeline.py +++ b/examples/start_agentic_pipeline.py @@ -6,6 +6,8 @@ from roll.distributed.scheduler.initialize import init from roll.pipeline.agentic.agentic_config import AgenticConfig +from roll.utils.import_utils import safe_import_class +from roll.utils.str_utils import print_pipeline_config def main(): @@ -19,14 +21,17 @@ def main(): initialize(config_path=args.config_path, job_name="app") cfg = compose(config_name=args.config_name) - print(OmegaConf.to_yaml(cfg, resolve=True)) - ppo_config = from_dict(data_class=AgenticConfig, data=OmegaConf.to_container(cfg, resolve=True)) init() - from roll.pipeline.agentic.agentic_pipeline import AgenticPipeline - pipeline = AgenticPipeline(pipeline_config=ppo_config) + print_pipeline_config(ppo_config) + + pipeline_cls = getattr(cfg, "pipeline_cls", "roll.pipeline.agentic.agentic_pipeline.AgenticPipeline") + if isinstance(pipeline_cls, str): + pipeline_cls = safe_import_class(pipeline_cls) + + pipeline = pipeline_cls(pipeline_config=ppo_config) pipeline.run() diff --git a/examples/wan2.2-14B-reward_fl_ds/reward_fl_config.yaml b/examples/wan2.2-14B-reward_fl_ds/reward_fl_config.yaml index b950a1dde..7c9c2b5fd 100644 --- a/examples/wan2.2-14B-reward_fl_ds/reward_fl_config.yaml +++ b/examples/wan2.2-14B-reward_fl_ds/reward_fl_config.yaml @@ -45,6 +45,9 @@ actor_train: num_inference_steps: 8 mid_timestep: 4 final_timestep: 7 + lora_base_model: dit2 + lora_target_modules: q,k,v,o,ffn.0,ffn.2 + lora_rank: 32 training_args: learning_rate: 2.5e-6 diff --git a/mcore_adapter/requirements.txt b/mcore_adapter/requirements.txt index d47035d4a..ee28b802c 100644 --- a/mcore_adapter/requirements.txt +++ b/mcore_adapter/requirements.txt @@ -1,3 +1,3 @@ megatron-core>=0.13.0,<0.14.0 -transformers>=4.48 +transformers>=4.50.0 accelerate>=0.27.2 diff --git a/mcore_adapter/src/mcore_adapter/__init__.py b/mcore_adapter/src/mcore_adapter/__init__.py index a0c9e6110..72637ff8f 100644 --- a/mcore_adapter/src/mcore_adapter/__init__.py +++ b/mcore_adapter/src/mcore_adapter/__init__.py @@ -3,5 +3,5 @@ from .training_args import Seq2SeqTrainingArguments, 
TrainingArguments -__version__ = "0.7.0.dev0" +__version__ = "0.8.0" __all__ = ["McaModelConfig", "McaGPTModel", "TrainingArguments", "Seq2SeqTrainingArguments", "McaTrainer"] diff --git a/mcore_adapter/src/mcore_adapter/adapters/__init__.py b/mcore_adapter/src/mcore_adapter/adapters/__init__.py index b223260a7..01a2123cf 100644 --- a/mcore_adapter/src/mcore_adapter/adapters/__init__.py +++ b/mcore_adapter/src/mcore_adapter/adapters/__init__.py @@ -1,4 +1,6 @@ -from ..utils import get_logger, is_peft_available +from transformers.utils import is_peft_available + +from ..utils import get_logger logger = get_logger(__name__) @@ -13,7 +15,7 @@ ) else: - def apply_megatron_lora(*args, **kwargs): + def apply_megatron_lora(): raise ValueError("PEFT is not available. Please install PEFT to use LoRA adapters.") def find_all_linear_modules(model): diff --git a/mcore_adapter/src/mcore_adapter/adapters/lora_layer.py b/mcore_adapter/src/mcore_adapter/adapters/lora_layer.py index 88980555b..f201a1446 100644 --- a/mcore_adapter/src/mcore_adapter/adapters/lora_layer.py +++ b/mcore_adapter/src/mcore_adapter/adapters/lora_layer.py @@ -53,8 +53,6 @@ def __init__( super().__init__(config=config) LoraLayer.__init__(self, base_layer=base_layer) - # lora needs to be forced to upgrade to 32-bit precision, otherwise it will overflow - self.config.params_dtype = torch.float32 if use_dora: raise ValueError(f"{self.__class__.__name__} does not support DoRA yet, please set it to False") self.is_grouped = isinstance(base_layer, TEGroupedLinear) @@ -84,7 +82,15 @@ def _create_lora_layers(self, r, lora_bias, **kwargs): raise NotImplementedError("_create_lora_layers must be implemented in subclasses") def update_layer( - self, adapter_name, r, *, lora_alpha, lora_dropout, init_lora_weights, use_rslora, lora_bias, **kwargs + self, + adapter_name, + r, + lora_alpha, + lora_dropout, + init_lora_weights, + use_rslora: bool = False, + lora_bias: bool = False, + **kwargs, ): if r <= 0: raise ValueError(f"`r` should be a positive integer value but the value passed is {r}") @@ -99,11 +105,11 @@ def update_layer( # Create LoRA layers based on subclass implementation lora_layer_kwargs = { - "skip_bias_add": False, - "init_method": self.config.init_method, "config": self.config, + "init_method": self.config.init_method, "is_expert": self.is_expert, - "tp_group": self.base_layer.tp_group + "skip_bias_add": False, + "tp_group": self.base_layer.tp_group, } lora_a, lora_b = self._create_lora_layers(r, lora_bias, **lora_layer_kwargs) @@ -243,7 +249,8 @@ def forward(self, x: torch.Tensor, *args: Any, **kwargs: Any): ) if isinstance(lora_result, tuple): lora_result = lora_result[0] - lora_result = lora_result * scaling + if scaling != 1.0: + lora_result = lora_result * scaling if self.sequence_parallel and self.base_layer.parallel_mode == "row": lora_result = scatter_to_sequence_parallel_region(lora_result) @@ -406,6 +413,7 @@ def _create_lora_layers(self, r, lora_bias, **kwargs): in_features = self.in_features * self.tp_size if self.is_grouped: + r = r // self.config.moe_router_topk lora_a = TERowParallelGroupedLinear( num_gemms=self.base_layer.num_gemms, input_size=in_features, @@ -449,6 +457,7 @@ def _create_lora_layers(self, r, lora_bias, **kwargs): out_features = self.out_features * self.tp_size if self.is_grouped: + r = r // self.config.moe_router_topk lora_a = TEGroupedLinear( num_gemms=self.base_layer.num_gemms, input_size=self.in_features, @@ -502,7 +511,9 @@ def dispatch_megatron( new_module = 
LoraRouterParallelLinear(base_layer=target, adapter_name=adapter_name, **kwargs) elif isinstance(target_base_layer, (TERowParallelLinear, TERowParallelGroupedLinear)): new_module = LoraRowParallelLinear(base_layer=target, adapter_name=adapter_name, **kwargs) - elif isinstance(target_base_layer, (TEColumnParallelLinear, TEColumnParallelGroupedLinear, TELayerNormColumnParallelLinear)): + elif isinstance( + target_base_layer, (TEColumnParallelLinear, TEColumnParallelGroupedLinear, TELayerNormColumnParallelLinear) + ): new_module = LoraColumnParallelLinear(base_layer=target, adapter_name=adapter_name, **kwargs) elif isinstance(target_base_layer, (TELinear, TEGroupedLinear)): # default to column parallel linear for non-parallel linear layers @@ -510,6 +521,7 @@ def dispatch_megatron( return new_module + def patch_TELinear(): def __repr__(self): return ( diff --git a/mcore_adapter/src/mcore_adapter/checkpointing.py b/mcore_adapter/src/mcore_adapter/checkpointing.py index db548ef9e..df5c2667b 100644 --- a/mcore_adapter/src/mcore_adapter/checkpointing.py +++ b/mcore_adapter/src/mcore_adapter/checkpointing.py @@ -2,14 +2,6 @@ import torch from megatron.core import dist_checkpointing, mpu -from transformers.modeling_utils import ( - SAFE_WEIGHTS_INDEX_NAME, - SAFE_WEIGHTS_NAME, - WEIGHTS_INDEX_NAME, - WEIGHTS_NAME, - get_checkpoint_shard_files, - load_state_dict, -) from .constants import TRACKER_FILENAME from .utils import get_logger diff --git a/mcore_adapter/src/mcore_adapter/constants.py b/mcore_adapter/src/mcore_adapter/constants.py index 2f8b75138..6e1c44724 100644 --- a/mcore_adapter/src/mcore_adapter/constants.py +++ b/mcore_adapter/src/mcore_adapter/constants.py @@ -4,3 +4,5 @@ DIST_OPTIMIZER_DIR = "dist_optimizer" HUGGINGFACE_AUTOMAP_CACHE = "./.cache/huggingface/automap" + +ADAPTER_CONFIG_NAME = "adapter_config.json" diff --git a/mcore_adapter/src/mcore_adapter/initialize.py b/mcore_adapter/src/mcore_adapter/initialize.py index 117a763fc..fa8f70457 100644 --- a/mcore_adapter/src/mcore_adapter/initialize.py +++ b/mcore_adapter/src/mcore_adapter/initialize.py @@ -5,11 +5,10 @@ import torch from megatron.core import mpu, tensor_parallel +from .platforms import current_platform from .training_args import TrainingArguments from .utils import get_logger -from .platforms import current_platform - logger = get_logger(__name__) diff --git a/mcore_adapter/src/mcore_adapter/models/__init__.py b/mcore_adapter/src/mcore_adapter/models/__init__.py index 9f68e4208..2e6785fbc 100644 --- a/mcore_adapter/src/mcore_adapter/models/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/__init__.py @@ -11,7 +11,9 @@ qwen3, qwen3_moe, qwen3_next, + qwen3_omni, qwen3_vl, + qwen3_vl_moe, ) from .auto import AutoConfig, AutoModel from .model_config import McaModelConfig diff --git a/mcore_adapter/src/mcore_adapter/models/converter/convert_utils.py b/mcore_adapter/src/mcore_adapter/models/converter/convert_utils.py index 2d2cd7cdc..630d4145a 100644 --- a/mcore_adapter/src/mcore_adapter/models/converter/convert_utils.py +++ b/mcore_adapter/src/mcore_adapter/models/converter/convert_utils.py @@ -1,3 +1,4 @@ +import math import re from dataclasses import dataclass, field from importlib.metadata import version @@ -7,6 +8,7 @@ import torch.distributed as dist from megatron.core import mpu from packaging.version import Version as PkgVersion + from ...platforms import current_platform @@ -21,32 +23,76 @@ MAX_SHARD_SIZE = 5_000_000_000 # 5GB -def get_layer_index(weight_name: str, prefix: str): +def 
get_layer_index(weight_name: str, prefix: str) -> Optional[int]: + """ + 1. megatron format: decoder.layers.{layer_index}.{weight} -> layer_index + 2. mtp format: mtp.layers.{layer_index}.{weight} -> layer_index + 3. hf format: model.layers.{layer_index}.{weight} -> layer_index + """ + escaped_prefix = re.escape(prefix) + pattern = rf"^{escaped_prefix}(\d+)(?:\.|$)" + match = re.match(pattern, weight_name) + return int(match.group(1)) if match else None + + +def get_moe_index(weight_name: str, prefix: str, moe_prefix: str) -> Optional[int]: + """ + 1. megatron format: decoder.layers.{layer_index}.mlp.experts.local_experts.{moe_index}.{weight} -> moe_index + 2. mtp format: mtp.layers.{layer_index}.transformer_layer.mlp.experts.local_experts.{moe_index}.{weight} -> moe_index + """ if not weight_name.startswith(prefix): return None - return int(weight_name.replace(prefix, "").split(".")[0]) + escaped_prefix = re.escape(prefix) + escaped_moe_prefix = re.escape(moe_prefix) + pattern = rf"^({escaped_prefix}\d+{escaped_moe_prefix})(\d+)(?:\.|$)" + match = re.match(pattern, weight_name) + return int(match.group(2)) if match else None + + +def get_layer_prefix(weight_name: str, prefix: str) -> str: + """ + decoder.layers.{layer_index}.{weight} -> decoder.layers.{layer_index} + model.layers.{layer_index}.{weight} -> model.layers.{layer_index} + """ + escaped_prefix = re.escape(prefix) + pattern = rf"^({escaped_prefix}\d+)" + if match := re.match(pattern, weight_name): + return match.group(1) + raise ValueError(f"Cannot get layer prefix from {weight_name=} with {prefix=}") + + +def get_moe_prefix(weight_name: str, prefix: str, moe_prefix: str) -> str: + """ + decoder.layers.{layer_index}.mlp.experts.local_experts.{moe_index}.{weight} -> decoder.layers.{layer_index}.mlp.experts.local_experts.{moe_index} + model.layers.{layer_index}.mlp.experts.{moe_index}.{weight} -> model.layers.{layer_index}.mlp.experts.{moe_index} + + For qwen3_vl_moe: + model.language_model.layers.{layer_index}.mlp.experts.{weight} -> model.language_model.layers.{layer_index}.mlp.experts + """ + escaped_prefix = re.escape(prefix) + escaped_moe_prefix = re.escape(moe_prefix) + pattern = rf"^({escaped_prefix}\d+{escaped_moe_prefix}\d+)" + if match := re.match(pattern, weight_name): + return match.group(1) + + # For qwen3_vl_moe + pattern = rf"^({escaped_prefix}\d+{escaped_moe_prefix})" + if match := re.match(pattern, weight_name): + return match.group(1) + raise ValueError(f"Cannot get moe prefix from {weight_name=} with {prefix=} and {moe_prefix=}") def get_weight_prefix(weight_name: str, prefix: str, moe_prefix: str = None): if not weight_name.startswith(prefix): return "" - layer_index = get_layer_index(weight_name, prefix) - layer_prefix = prefix + str(layer_index) - if moe_prefix is None: - return layer_prefix - return layer_prefix + get_weight_prefix(weight_name[len(layer_prefix) :], prefix=moe_prefix) + if moe_prefix is not None and moe_prefix in weight_name: + return get_moe_prefix(weight_name, prefix, moe_prefix) + return get_layer_prefix(weight_name, prefix) def remove_weight_prefix(weight_name: str, prefix: str, moe_prefix: str = None): weight_prefix = get_weight_prefix(weight_name, prefix, moe_prefix) - return weight_name.replace(weight_prefix, "", 1) - - -def get_moe_index(weight_name: str, prefix: str, moe_prefix: str = None): - if not weight_name.startswith(prefix): - return None - mos_layer_name = remove_weight_prefix(weight_name, prefix) - return get_layer_index(mos_layer_name, moe_prefix) + return 
weight_name.removeprefix(weight_prefix) def add_layer_prefix( @@ -59,9 +105,13 @@ def add_layer_prefix( if not weight_name.startswith("."): # not weight in layer return weight_name - if moe_index is not None: - weight_name = add_layer_prefix(weight_name, moe_index, moe_prefix) - return prefix + str(layer_index) + weight_name + + if moe_index is not None and moe_prefix is not None: + full_prefix = f"{prefix}{layer_index}{moe_prefix}{moe_index}" + else: + full_prefix = f"{prefix}{layer_index}" + + return full_prefix + weight_name def convert_to_mca_prefix(weight_prefix: str, prefix: str, moe_prefix: str = None): @@ -256,6 +306,28 @@ def get_te_version_str(): return get_te_version() >= PkgVersion("1.9.0.dev0") +def _noisy_mean_initialization(embed_weight: "torch.Tensor", num_new_tokens: int) -> None: + embedding_dim = embed_weight.size(1) + if torch.distributed.get_rank() == 0: + avg_weight = embed_weight[:-num_new_tokens].mean(dim=0, keepdim=True) + noise_weight = torch.empty_like(embed_weight[-num_new_tokens:]) + noise_weight.normal_(mean=0, std=(1.0 / math.sqrt(embedding_dim))) + added_embed_weight = avg_weight + noise_weight + torch.distributed.broadcast(added_embed_weight.to(current_platform.current_device()), src=0) + else: + added_embed_weight = torch.empty_like(embed_weight[-num_new_tokens:], device=current_platform.current_device()) + torch.distributed.broadcast(added_embed_weight, src=0) + embed_weight[-num_new_tokens:] = added_embed_weight.cpu() + + +def resize_embedding_layer(original_mca_weight: torch.Tensor, resized_vocab_size: int): + mca_weight = original_mca_weight.clone() + original_vocab_size = mca_weight.size(0) + mca_weight.resize_((resized_vocab_size, mca_weight.size(1))) + _noisy_mean_initialization(mca_weight, resized_vocab_size - original_vocab_size) + return mca_weight + + @dataclass class StackedTensors: tensors: Optional[List["torch.Tensor"]] @@ -295,6 +367,7 @@ def pop_tensor(self, named_tensors: Dict[str, "torch.Tensor"]): @staticmethod def pop_tensor_in_buffer(named_tensors: Dict[str, "torch.Tensor"], tensors_meta, buffer: "torch.Tensor"): for name, meta in tensors_meta.items(): + meta = tensors_meta[name] bucket_start, tensor_start, save_bytes = meta["bucket_start"], meta["tensor_start"], meta["save_bytes"] tensor = named_tensors.get(name, None) if tensor is None: diff --git a/mcore_adapter/src/mcore_adapter/models/converter/dist_converter.py b/mcore_adapter/src/mcore_adapter/models/converter/dist_converter.py index f1a7e80b0..c19fb7971 100644 --- a/mcore_adapter/src/mcore_adapter/models/converter/dist_converter.py +++ b/mcore_adapter/src/mcore_adapter/models/converter/dist_converter.py @@ -8,7 +8,7 @@ import torch from megatron.core.transformer.pipeline_parallel_layer_layout import LayerType, PipelineParallelLayerLayout -from ...utils import get_logger +from ...utils import get_logger, is_megatron_llama from .convert_utils import ( StackedTensors, add_mca_layer_prefix, @@ -51,7 +51,7 @@ class DistParallelConfig: swiglu_weights: List[str] = field(default_factory=list) # ungrouped TE name to grouped - grouped_duplicated_map: Dict[str, str] = field(default_factory=dict) + grouped_duplicated_map: Dict[str, str] = field(default_factory=dict) grouped_column_map: Dict[str, str] = field(default_factory=dict) grouped_row_map: Dict[str, str] = field(default_factory=dict) @@ -59,7 +59,9 @@ class DistParallelConfig: def __post_init__(self): self.local_to_te_key_map = {v: k for k, v in self.te_to_local_key_map.items()} - self.grouped_duplicated_weights = 
list(self.grouped_duplicated_map.keys()) + list(self.grouped_duplicated_map.values()) + self.grouped_duplicated_weights = list(self.grouped_duplicated_map.keys()) + list( + self.grouped_duplicated_map.values() + ) self.grouped_column_weights = list(self.grouped_column_map.keys()) + list(self.grouped_column_map.values()) self.grouped_row_weights = list(self.grouped_row_map.keys()) + list(self.grouped_row_map.values()) self.grouped_map = {**self.grouped_duplicated_map, **self.grouped_column_map, **self.grouped_row_map} @@ -88,24 +90,24 @@ def merge_configs(self, other: "DistParallelConfig") -> "DistParallelConfig": lora_config = DistParallelConfig( duplicated_weights=[ - ".self_attention.linear_proj.lora_B.*.weight", - ".self_attention.linear_qkv.lora_A.*.weight", - ".mlp.linear_fc1.lora_A.*.weight", - ".linear_fc1.lora_A.*.weight", - ".mlp.linear_fc2.lora_B.*.weight", - ".linear_fc2.lora_B.*.weight", + ".self_attention.linear_proj.lora_B.weight", + ".self_attention.linear_qkv.lora_A.weight", + ".mlp.linear_fc1.lora_A.weight", + ".linear_fc1.lora_A.weight", + ".mlp.linear_fc2.lora_B.weight", + ".linear_fc2.lora_B.weight", ], column_parallel_weights=[ - ".self_attention.linear_qkv.lora_B.*.weight", - ".mlp.linear_fc1.lora_B.*.weight", - ".linear_fc1.lora_B.*.weight", + ".self_attention.linear_qkv.lora_B.weight", + ".mlp.linear_fc1.lora_B.weight", + ".linear_fc1.lora_B.weight", ], row_parallel_weights=[ - ".self_attention.linear_proj.lora_A.*.weight", - ".mlp.linear_fc2.lora_A.*.weight", - ".linear_fc2.lora_A.*.weight", + ".self_attention.linear_proj.lora_A.weight", + ".mlp.linear_fc2.lora_A.weight", + ".linear_fc2.lora_A.weight", ], - swiglu_weights=[".mlp.linear_fc1.lora_B.*.weight", ".linear_fc1.lora_B.*.weight"], + swiglu_weights=[".mlp.linear_fc1.lora_B.weight", ".linear_fc1.lora_B.weight"], ) @@ -141,11 +143,11 @@ def merge_configs(self, other: "DistParallelConfig") -> "DistParallelConfig": lora_te_moe_config = DistParallelConfig( grouped_duplicated_map={ - ".linear_fc1.lora_A.*.weight": ".mlp.experts.linear_fc1.lora_A.*.weight", - ".linear_fc2.lora_B.*.weight": ".mlp.experts.linear_fc2.lora_B.*.weight", + ".linear_fc1.lora_A.weight": ".mlp.experts.linear_fc1.lora_A.weight", + ".linear_fc2.lora_B.weight": ".mlp.experts.linear_fc2.lora_B.weight", }, - grouped_column_map={".linear_fc1.lora_B.*.weight": ".mlp.experts.linear_fc1.lora_B.*.weight"}, - grouped_row_map={".linear_fc2.lora_A.*.weight": ".mlp.experts.linear_fc2.lora_A.*.weight"}, + grouped_column_map={".linear_fc1.lora_B.weight": ".mlp.experts.linear_fc1.lora_B.weight"}, + grouped_row_map={".linear_fc2.lora_A.weight": ".mlp.experts.linear_fc2.lora_A.weight"}, ) @@ -166,6 +168,7 @@ def merge_configs(self, other: "DistParallelConfig") -> "DistParallelConfig": ], ) + mla_dist_config = DistParallelConfig( pre_process_weights=[MCORE_WORD_EMBEDDING], post_process_weights=[MCORE_LM_HEAD, "decoder.final_layernorm.weight"], @@ -206,6 +209,13 @@ def merge_configs(self, other: "DistParallelConfig") -> "DistParallelConfig": ).merge_configs(mtp_config) +megatron_llama_config = DistParallelConfig( + duplicated_weights=[".input_layernorm.weight"], + grouped_column_map={".linear_fc1.weight": ".mlp.weight1"}, + grouped_row_map={".linear_fc2.weight": ".mlp.weight2"}, +) + + dist_configs: Dict[str, List[DistParallelConfig]] = {} @@ -222,12 +232,27 @@ def get_dist_config(name) -> DistParallelConfig: return dist_config +lora_shared_moe_dist_config = DistParallelConfig( + duplicated_weights=[ + ".mlp.shared_experts.linear_fc1.lora_A.weight", + 
".mlp.shared_experts.linear_fc2.lora_B.weight", + ], + column_parallel_weights=[ + ".mlp.shared_experts.linear_fc1.lora_B.weight", + ], + row_parallel_weights=[ + ".mlp.shared_experts.linear_fc2.lora_A.weight", + ], + swiglu_weights=[".mlp.shared_experts.linear_fc1.lora_B.weight"], +) + + shared_moe_dist_config = DistParallelConfig( duplicated_weights=[".mlp.shared_experts.gate_weight"], row_parallel_weights=[".mlp.shared_experts.linear_fc2.weight"], swiglu_weights=[".mlp.shared_experts.linear_fc1.weight"], te_to_local_key_map={".pre_mlp_layernorm.weight": ".pre_mlp_layernorm.weight"}, -) +).merge_configs(lora_shared_moe_dist_config) class DistConverter: @@ -264,6 +289,8 @@ def __init__( dist_config = get_dist_config(mca_config.hf_model_type) if self.use_te_grouped_moe: dist_config = dist_config.merge_configs(te_moe_config) + if is_megatron_llama(): + dist_config = dist_config.merge_configs(megatron_llama_config) self.config = dist_config self.layout: PipelineParallelLayerLayout = self.mca_config.pipeline_model_parallel_layout @@ -412,7 +439,7 @@ def get_pure_name(self, name: str): pure_name = self.config.local_to_te_key_map[pure_name] return pure_name - def _name_relocate(self, name: str, moe_index: Optional[int] = None): + def _name_relocate(self, name: str, moe_index: Optional[int] = None, moe_index_preprocessed: bool = False): pure_name = self.get_pure_name(name) if self.mca_config.transformer_impl == "local": if self.revert: # when revert to hf, convert to te name @@ -428,7 +455,8 @@ def _name_relocate(self, name: str, moe_index: Optional[int] = None): if self.revert: if self.mca_config.moe_grouped_gemm: pure_name = self.get_matched_name(pure_name, self.config.grouped_reverse_map) - moe_index = self.num_layers_for_expert * self.expert_model_parallel_rank + moe_index + if not moe_index_preprocessed: + moe_index = self.num_layers_for_expert * self.expert_model_parallel_rank + moe_index else: if self.mca_config.moe_grouped_gemm: moe_index = None @@ -467,10 +495,7 @@ def get_global_layer_index(self, local_layer_index: int, vp_stage: int): if self.layout is not None: return self.layout.get_layer_offset(vp_stage=vp_stage) + local_layer_index - chunk_index = ( - self.pipeline_model_parallel_rank - + vp_stage * self.mca_config.pipeline_model_parallel_size - ) + chunk_index = self.pipeline_model_parallel_rank + vp_stage * self.mca_config.pipeline_model_parallel_size global_layer_index = local_layer_index + chunk_index * self.num_layers_per_virtual_rank if self.mca_config.account_for_embedding_in_pipeline_split and chunk_index > 0: global_layer_index -= 1 @@ -521,13 +546,13 @@ def _convert_te_grouped_column(self, name: str, weights: "Tensor"): relocated_name = self._name_relocate(name) + str(moe_index) return {relocated_name: weights} - def _revert_te_grouped_column(self, name: str, weights: List["Tensor"]): + def _revert_te_grouped_column(self, name: str, weights: List["Tensor"], moe_index_preprocessed: bool = False): if self.swiglu: weight = self._revert_swiglu(weights) else: weight = self._revert_column_parallel(weights) moe_index = int(extract_suffix_number(name)) - return {self._name_relocate(name, moe_index=moe_index): weight} + return {self._name_relocate(name, moe_index=moe_index, moe_index_preprocessed=moe_index_preprocessed): weight} def _convert_grouped_column(self, name: str, weights: "Tensor"): if self.swiglu: @@ -546,7 +571,7 @@ def _convert_grouped_column(self, name: str, weights: "Tensor"): weights = [weight[1] for weight in weights] return {relocated_name: torch.stack(weights, 
dim=0).view(self.mca_config.hidden_size, -1)} - def _revert_grouped_column(self, name: str, weights: List["Tensor"], vp_stage: int): + def _revert_grouped_column(self, name: str, weights: List["Tensor"]): def _revert_grouped(weight: "Tensor"): weight = weight.view(self.num_layers_for_expert, self.mca_config.hidden_size, -1) expert_weights = torch.unbind(weight, dim=0) @@ -569,10 +594,12 @@ def _revert_column(weights: List["Tensor"]): for moe_index, weight in enumerate(ungrouped_weights) } - def handle_grouped_column(self, name: str, weights: Union["Tensor", List["Tensor"]]) -> Dict[str, "Tensor"]: + def handle_grouped_column( + self, name: str, weights: Union["Tensor", List["Tensor"]], moe_index_preprocessed: bool = False + ) -> Dict[str, "Tensor"]: if self.revert: if self.use_te_grouped_moe: - return self._revert_te_grouped_column(name, weights) + return self._revert_te_grouped_column(name, weights, moe_index_preprocessed=moe_index_preprocessed) return self._revert_grouped_column(name, weights) else: if self.use_te_grouped_moe: @@ -585,10 +612,10 @@ def _convert_te_grouped_row(self, name: str, weights: "Tensor"): relocated_name = self._name_relocate(name) + str(moe_index) return {relocated_name: weights} - def _revert_te_grouped_row(self, name: str, weights: List["Tensor"]): + def _revert_te_grouped_row(self, name: str, weights: List["Tensor"], moe_index_preprocessed: bool = False): weights = self._revert_row_parallel(weights) moe_index = int(extract_suffix_number(name)) - return {self._name_relocate(name, moe_index=moe_index): weights} + return {self._name_relocate(name, moe_index=moe_index, moe_index_preprocessed=moe_index_preprocessed): weights} def _convert_grouped_row(self, name: str, weights: "Tensor"): weights = self._convert_row_parallel(weights) @@ -620,10 +647,12 @@ def _revert_grouped(weight: "Tensor"): for moe_index, weight in enumerate(ungrouped_weights) } - def handle_grouped_row(self, name: str, weights: Union["Tensor", List["Tensor"]]) -> Dict[str, "Tensor"]: + def handle_grouped_row( + self, name: str, weights: Union["Tensor", List["Tensor"]], moe_index_preprocessed: bool = False + ) -> Dict[str, "Tensor"]: if self.revert: if self.use_te_grouped_moe: - return self._revert_te_grouped_row(name, weights) + return self._revert_te_grouped_row(name, weights, moe_index_preprocessed=moe_index_preprocessed) return self._revert_grouped_row(name, weights) else: if self.use_te_grouped_moe: @@ -644,7 +673,7 @@ def get_matched_name(self, name: str, weight_map: dict[str, Any]) -> Optional[st for key in weight_map: if fnmatch.fnmatch(name, key): name_pattern = weight_map[key] - return name_pattern[:name_pattern.find(".lora")] + name[name.find(".lora"):] + return name_pattern[: name_pattern.find(".lora")] + name[name.find(".lora") :] def get_local_moe_index(self, name: str) -> Optional[Union[int, List[int]]]: pure_name = remove_mca_weight_prefix(name) @@ -663,7 +692,10 @@ def get_global_moe_index(self, name: str) -> Optional[Union[int, List[int]]]: local_moe_index = self.get_local_moe_index(name) if local_moe_index is None: return None - local_to_global = lambda i: i + self.num_layers_for_expert * self.expert_model_parallel_rank + + def local_to_global(i): + return i + self.num_layers_for_expert * self.expert_model_parallel_rank + if isinstance(local_moe_index, int): return local_to_global(local_moe_index) else: @@ -694,6 +726,7 @@ def dist_convert( weights: Union["Tensor", List["Tensor"]], vp_stage: Optional[int] = None, layer_index_preprocessed: bool = False, + moe_index_preprocessed: 
bool = False, ) -> Dict[str, "Tensor"]: """ Convert weights for distributed parallelism. @@ -702,9 +735,12 @@ def dist_convert( name: Weight name weights: Weight tensor(s) vp_stage: Virtual pipeline stage - layer_index_preprocessed: If True, the name's layer index has already been preprocessed - for pipeline parallelism by the caller. If False (default), DistConverter will + layer_index_preprocessed: If True, the name's layer index has already been preprocessed + for pipeline parallelism by the caller. If False (default), DistConverter will handle the layer index conversion between global and local indices. + moe_index_preprocessed: If True, the name's moe index has already been preprocessed + for expert parallelism by the caller. If False (default), DistConverter will + handle the moe index conversion between global and local indices. """ if vp_stage is None: vp_stage = self.virtual_pipeline_model_parallel_rank @@ -730,9 +766,9 @@ def dist_convert( if self.mca_config.moe_grouped_gemm and self.name_match(pure_name, self.config.grouped_duplicated_weights): return self.handle_grouped_duplicated(name, weights) if self.mca_config.moe_grouped_gemm and self.name_match(pure_name, self.config.grouped_column_weights): - return self.handle_grouped_column(name, weights) + return self.handle_grouped_column(name, weights, moe_index_preprocessed=moe_index_preprocessed) if self.mca_config.moe_grouped_gemm and self.name_match(pure_name, self.config.grouped_row_weights): - return self.handle_grouped_row(name, weights) + return self.handle_grouped_row(name, weights, moe_index_preprocessed=moe_index_preprocessed) if self.swiglu and self.name_match(pure_name, self.config.swiglu_weights): return self.handle_swiglu(name, weights) if self.name_match(pure_name, self.config.duplicated_weights): diff --git a/mcore_adapter/src/mcore_adapter/models/converter/model_converter.py b/mcore_adapter/src/mcore_adapter/models/converter/model_converter.py index d9b302777..e572429b9 100644 --- a/mcore_adapter/src/mcore_adapter/models/converter/model_converter.py +++ b/mcore_adapter/src/mcore_adapter/models/converter/model_converter.py @@ -15,6 +15,7 @@ SAFE_WEIGHTS_NAME, WEIGHTS_INDEX_NAME, WEIGHTS_NAME, + is_peft_available, is_safetensors_available, ) @@ -28,11 +29,15 @@ gather_tensor_parallel, get_tensor_size, parse_size_to_int, + resize_embedding_layer, ) -from .dist_converter import DistConverter +from .dist_converter import MCORE_WORD_EMBEDDING, MCORE_LM_HEAD, DistConverter from .template import get_template +if is_peft_available(): + from peft import PeftModel, get_peft_model_state_dict + if is_safetensors_available(): from safetensors.torch import save_file as safe_save_file @@ -55,6 +60,7 @@ def __init__( to_hf: bool = False, verbose=False, efficient_mode: bool = False, + resized_vocab_size: int = None, ): self.mca_config = mca_config self.verbose = verbose @@ -74,6 +80,7 @@ def __init__( revert=to_hf, efficient_mode=efficient_mode, ) + self.resized_vocab_size = resized_vocab_size def log(self, msg): if self.verbose: @@ -134,6 +141,12 @@ def get_mca_state_dict(self, state_dict_iter, vp_stage: int): converted_state_dict = self.template.add_hf_weight(name, weight) if converted_state_dict is not None: for mca_name, mca_weight in converted_state_dict.items(): + # resize before tensor parallel conversion + if self.resized_vocab_size and ( + (mca_name == MCORE_WORD_EMBEDDING) or + (mca_name == MCORE_LM_HEAD and not self.mca_config.tie_embeddings_and_output_weights) + ): + mca_weight = resize_embedding_layer(mca_weight, 
self.resized_vocab_size) named_weights = self.dist_converter.dist_convert(mca_name, mca_weight, vp_stage=vp_stage) if named_weights is not None: mca_state_dict.update(named_weights) @@ -150,16 +163,25 @@ def get_mca_state_dict(self, state_dict_iter, vp_stage: int): def _mca_named_params_with_vp_stage(self, models): for vp_stage, model in enumerate(models): - mca_state_dict = model.state_dict_for_save_checkpoint() - mca_state_dict = {k: v for k, v in mca_state_dict.items() if not k.endswith("._extra_state")} - for mca_name, weight in sorted(mca_state_dict.items()): - yield vp_stage, mca_name, weight + if is_peft_available() and isinstance(model, PeftModel): + for adapter_name in model.peft_config.keys(): + mca_state_dict = get_peft_model_state_dict(model, model.state_dict_for_save_checkpoint(), adapter_name) + mca_state_dict = {k: v for k, v in mca_state_dict.items() if not k.endswith("._extra_state")} + for mca_name, weight in sorted(mca_state_dict.items()): + yield adapter_name, vp_stage, mca_name, weight + else: + mca_state_dict = model.state_dict_for_save_checkpoint() + mca_state_dict = {k: v for k, v in mca_state_dict.items() if not k.endswith("._extra_state")} + for mca_name, weight in sorted(mca_state_dict.items()): + yield None, vp_stage, mca_name, weight def convert_to_hf( self, mca_state_dict: Dict[str, list["Tensor"]], vp_stage: Optional[int] = None, layer_index_preprocessed: bool = False, + moe_index_preprocessed: bool = False, + **kwargs, ) -> Dict[str, "Tensor"]: """ Convert Mca state dict to HuggingFace format. @@ -167,9 +189,12 @@ def convert_to_hf( Args: mca_state_dict: Dictionary of mca weight names to tensor lists vp_stage: Virtual pipeline stage - layer_index_preprocessed: If True, the weight names' layer indices have already been - preprocessed for pipeline parallelism by the caller. If False (default), + layer_index_preprocessed: If True, the weight names' layer indices have already been + preprocessed for pipeline parallelism by the caller. If False (default), DistConverter will handle the layer index conversion between global and local indices. + moe_index_preprocessed: If True, the weight names' moe indices have already been + preprocessed for expert parallelism by the caller. If False (default), + DistConverter will handle the moe index conversion between global and local indices. 
""" if vp_stage is None: vp_stage = mpu.get_virtual_pipeline_model_parallel_rank() @@ -177,13 +202,21 @@ def convert_to_hf( hf_state_dict = {} for mca_name, weights in mca_state_dict.items(): merged_named_weights = self.dist_converter.dist_convert( - mca_name, weights, vp_stage=vp_stage, layer_index_preprocessed=layer_index_preprocessed + mca_name, + weights, + vp_stage=vp_stage, + layer_index_preprocessed=layer_index_preprocessed, + moe_index_preprocessed=moe_index_preprocessed, ) if merged_named_weights is None: continue converted = {} for merged_name, merged_weight in merged_named_weights.items(): - converted.update(self.template.add_mca_weight(merged_name, merged_weight)) + converted_state_dict = self.template.add_mca_weight(merged_name, merged_weight, **kwargs) + if converted_state_dict is not None: + converted.update(converted_state_dict) + else: + self.log(f"mca_name: {merged_name} added but not converted") hf_state_dict.update(converted or {}) return hf_state_dict @@ -193,6 +226,7 @@ def save_model_as_hf_inflight( save_directory: str, save_safetensors: bool = True, max_shard_size: Union[int, str] = MAX_SHARD_SIZE, + move_to_cpu: bool = False, ): assert self.dist_converter.revert, "save_model_as_hf_inflight only support to_hf ModelConverter" if not mpu.model_parallel_is_initialized(): @@ -208,50 +242,35 @@ def save_model_as_hf_inflight( expert_parallel = self.mca_config.expert_model_parallel_size > 1 only_need_expert = expert_parallel and mpu.get_expert_model_parallel_rank() > 0 - for vp_stage, mca_name, weight in self._mca_named_params_with_vp_stage(models): + last_adapter_name = None + for adapter_name, vp_stage, mca_name, weight in self._mca_named_params_with_vp_stage(models): if only_need_expert and not self.dist_converter.is_expert_parallel_weight(mca_name): continue weights = gather_tensor_parallel(weight, async_op=False) if weights is None: # only tp_rank0 need to convert and save continue + if move_to_cpu and isinstance(weights, list): + weights = [w.cpu() for w in weights] converted_state_dict = self.convert_to_hf(mca_state_dict={mca_name: weights}, vp_stage=vp_stage) - self.save_hf_shard_state_dict(shard_state, save_directory, converted_state_dict, save_safetensors) + self.save_hf_shard_state_dict( + shard_state, + os.path.join(save_directory, adapter_name) if adapter_name is not None else save_directory, + converted_state_dict, + save_safetensors, + ) - if mpu.get_tensor_model_parallel_rank() == 0: - self.save_shard_state_meta(shard_state, save_directory, save_safetensors) + if ( + adapter_name is not None + and adapter_name != last_adapter_name + and mpu.get_tensor_model_parallel_rank() == 0 + ): + self.save_shard_state_meta(shard_state, save_directory, save_safetensors) - def all_gather_weights_as_hf_inflight(self, models): - assert self.dist_converter.revert, "save_model_as_hf_inflight only support to_hf ModelConverter" + if adapter_name is not None: + last_adapter_name = adapter_name - expert_parallel = self.mca_config.expert_model_parallel_size > 1 - for vp_stage, mca_name, weight in self._mca_named_params_with_vp_stage(models): - moe_index = self.dist_converter.get_local_moe_index(mca_name) - group = ( - mpu.get_tensor_model_parallel_group() if moe_index is None else mpu.get_expert_tensor_parallel_group() - ) - if dist.get_world_size(group) == 1: - weights = [weight] - else: - weights = all_gather_tensors(weight, async_op=False, group=group) - hf_state_dict = self.convert_to_hf(mca_state_dict={mca_name: weights}, vp_stage=vp_stage) - for name, weight in 
hf_state_dict.items(): - if expert_parallel and moe_index is not None: - names = allgather_parallel_objs(name, group=mpu.get_expert_model_parallel_group()) - weights = all_gather_tensors( - weight, async_op=False, group=mpu.get_expert_model_parallel_group() - ) - for name, weight in zip(names, weights): - yield name, weight - else: - yield name, weight - - def all_gather_weights_as_hf_bucket(self, models, bucket_size: int = None): - bucket_manager = SendBucketManager(bucket_size or self._auto_bucket_size()) - for name, weight in self.all_gather_weights_as_hf_inflight(models): - yield from bucket_manager.push_tensor(weight, name=name) - last_meta, last_buffer = bucket_manager.pop_last_bucket() - if last_meta is not None: - yield last_meta, last_buffer + if mpu.get_tensor_model_parallel_rank() == 0: + self.save_shard_state_meta(shard_state, save_directory, save_safetensors) def _auto_bucket_size(self): # TODO: optimize this by max weight size diff --git a/mcore_adapter/src/mcore_adapter/models/converter/post_converter.py b/mcore_adapter/src/mcore_adapter/models/converter/post_converter.py index 61fc0cfcd..1bcd103a5 100644 --- a/mcore_adapter/src/mcore_adapter/models/converter/post_converter.py +++ b/mcore_adapter/src/mcore_adapter/models/converter/post_converter.py @@ -1,3 +1,5 @@ +import os +from collections import defaultdict from itertools import product from typing import TYPE_CHECKING, Optional @@ -12,25 +14,31 @@ AutoModelForCausalLM, AutoModelForImageTextToText, AutoModelForVision2Seq, + AutoModelForTextToWaveform, AutoProcessor, AutoTokenizer, ) from transformers.dynamic_module_utils import get_class_from_dynamic_module from transformers.models.auto.auto_factory import _get_model_class +from transformers.utils import is_peft_available from ...checkpointing import get_checkpoint_name, save_config_and_state_dict +from ...constants import ADAPTER_CONFIG_NAME from ...training_args import DistributingParallelArguments -from ...utils import get_logger, is_peft_available +from ...utils import get_logger from ..auto.config_auto import AutoConfig from .model_converter import ModelConverter from .template import get_template if is_peft_available(): - from peft import LoraConfig, PeftConfig, get_peft_model + from peft import LoraConfig, PeftConfig, get_peft_model, set_peft_model_state_dict if TYPE_CHECKING: + from transformers import PretrainedConfig + from ...training_args import DistributingParallelArguments + from ..model_config import McaModelConfig from .template import Template @@ -38,7 +46,12 @@ def _add_mca_state_dicts_to_hf( - model_converter: "ModelConverter", state_dicts, hf_state_dict, vp_stage: int, verbose: bool = True + model_converter: "ModelConverter", + state_dicts: list[dict[str, torch.Tensor] | dict[str, dict[str, torch.Tensor]]], + hf_state_dict: dict | dict[str, dict], + vp_stage: int, + verbose: bool = True, + **kwargs, ): def log(msg): if verbose: @@ -53,7 +66,7 @@ def log(msg): if mca_name.endswith("._extra_state"): continue weights = [state_dict[mca_name] if mca_name in state_dict else None for state_dict in state_dicts] - converted_state_dict = model_converter.convert_to_hf({mca_name: weights}, vp_stage=vp_stage) + converted_state_dict = model_converter.convert_to_hf({mca_name: weights}, vp_stage=vp_stage, **kwargs) if converted_state_dict is not None and len(converted_state_dict) > 0: for hf_name, hf_weight in converted_state_dict.items(): if hf_name in hf_state_dict: @@ -69,32 +82,16 @@ def log(msg): log(f"mca_name: {mca_name} added but not converted") -def 
convert_checkpoint_to_hf( - model_name_or_path: str, - save_directory: str, - adapter_name_or_path: Optional[str] = None, - torch_dtype: Optional["torch.dtype"] = None, - verbose: bool = True, -): - if is_lora := adapter_name_or_path is not None: - if not is_peft_available(): - raise ImportError("PEFT is not installed. Please install it with `pip install peft`") - ckpt_path = adapter_name_or_path - peft_config = PeftConfig.from_pretrained(adapter_name_or_path) - else: - ckpt_path = model_name_or_path - mca_config = AutoConfig.from_pretrained(ckpt_path) +def _load_mca_config_and_setup(checkpoint_path: str): + mca_config = AutoConfig.from_pretrained(checkpoint_path) if mca_config is None: raise ValueError("No mca config found in checkpoint") if mca_config.hf_model_type is None: raise ValueError("No hf model type found in mca config") - if is_lora: - setattr(mca_config, "lora_rank", peft_config.r) template: "Template" = get_template(mca_config.hf_model_type) hf_config = template.convert_mca_to_hf_config(mca_config) template.set_mca_config_for_ops(mca_config) - hf_state_dict = {} mpu.set_expert_model_parallel_world_size(mca_config.expert_model_parallel_size) mpu.set_pipeline_model_parallel_world_size(mca_config.pipeline_model_parallel_size) @@ -102,14 +99,24 @@ def convert_checkpoint_to_hf( if mca_config.virtual_pipeline_model_parallel_size is not None: mpu.set_virtual_pipeline_model_parallel_world_size(mca_config.virtual_pipeline_model_parallel_size) + return mca_config, hf_config + + +def _convert_state_dicts( + checkpoint_path: str, + mca_config: "McaModelConfig", + target_state_dict: dict, + verbose: bool = True, + adapter_name: str | None = None, + **kwargs, +): for pp_rank, ep_rank in product( range(mca_config.pipeline_model_parallel_size), range(mca_config.expert_model_parallel_size) ): state_dicts = [] - # TODO: use loader and support low_mem for tp_rank in range(mca_config.tensor_model_parallel_size): ckpt_name = get_checkpoint_name( - ckpt_path, + checkpoint_path, tensor_rank=tp_rank, pipeline_rank=pp_rank, pipeline_parallel=mca_config.pipeline_model_parallel_size > 1, @@ -117,10 +124,12 @@ def convert_checkpoint_to_hf( expert_parallel=mca_config.expert_model_parallel_size > 1, ) state_dicts.append(torch.load(ckpt_name, map_location="cpu")) + virtual_pipe_on = (mca_config.virtual_pipeline_model_parallel_size or 1) > 1 mpu.set_pipeline_model_parallel_rank(pp_rank) mpu.set_expert_model_parallel_rank(ep_rank) mpu.set_tensor_model_parallel_rank(0) + model_converter = ModelConverter( mca_config=mca_config, pipeline_model_parallel_rank=pp_rank, @@ -129,67 +138,50 @@ def convert_checkpoint_to_hf( verbose=verbose, to_hf=True, ) + for i in range(mca_config.virtual_pipeline_model_parallel_size or 1): if virtual_pipe_on: mpu.set_virtual_pipeline_model_parallel_rank(i) key = "model" + (str(i) if virtual_pipe_on else "") virtual_state_dicts = [sd.pop(key) for sd in state_dicts] _add_mca_state_dicts_to_hf( - model_converter, virtual_state_dicts, hf_state_dict, vp_stage=i, verbose=verbose + model_converter, + virtual_state_dicts, + target_state_dict, + vp_stage=i, + verbose=verbose, + **kwargs, ) + +def _get_hf_model_class(hf_config: "PretrainedConfig", mca_config: "McaModelConfig"): has_remote_code = hasattr(hf_config, "auto_map") and "AutoModelForCausalLM" in hf_config.auto_map model_class = AutoModelForCausalLM + if type(hf_config) in AutoModelForVision2Seq._model_mapping.keys(): model_class = AutoModelForVision2Seq elif type(hf_config) in AutoModelForImageTextToText._model_mapping.keys(): 
model_class = AutoModelForImageTextToText + elif type(hf_config) in AutoModelForTextToWaveform._model_mapping.keys(): + model_class = AutoModelForTextToWaveform if has_remote_code: class_ref = hf_config.auto_map["AutoModelForCausalLM"] model_class = get_class_from_dynamic_module(class_ref, mca_config.name_or_path) else: model_class = _get_model_class(hf_config, model_class._model_mapping) - if is_lora: - hf_config.save_pretrained(save_directory) - target_modules = set() - for name, _ in hf_state_dict.items(): - if ".lora_A." in name or ".lora_B." in name: - # TODO: support VLM lora - target_modules.add(name[:name.find(".lora")].split(".")[-1]) - target_modules = list(target_modules) - model = model_class.from_pretrained( - model_name_or_path, - config=hf_config, - torch_dtype=torch_dtype if torch_dtype is not None else mca_config.params_dtype, - trust_remote_code=True, - ) - lora_config = LoraConfig( - r=peft_config.r, - target_modules=target_modules, - lora_alpha=peft_config.lora_alpha, - lora_dropout=peft_config.lora_dropout, - use_rslora=peft_config.use_rslora, - modules_to_save=peft_config.modules_to_save, - ) - model = get_peft_model(model, lora_config) - model.base_model.model.load_state_dict(hf_state_dict, strict=False) - else: - model = model_class.from_pretrained( - None, - config=hf_config, - state_dict=hf_state_dict, - torch_dtype=torch_dtype if torch_dtype is not None else mca_config.params_dtype, - trust_remote_code=True, - ) - model.save_pretrained(save_directory) - mca_config.save_hf_auto_map_files(save_directory) - tokenizer = AutoTokenizer.from_pretrained(ckpt_path, trust_remote_code=True) + return model_class + + +def _save_tokenizer_and_processor(checkpoint_path: str, save_directory: str, verbose: bool = True): + tokenizer = AutoTokenizer.from_pretrained(checkpoint_path, trust_remote_code=True) try: - processor = AutoProcessor.from_pretrained(ckpt_path, trust_remote_code=True) + processor = AutoProcessor.from_pretrained(checkpoint_path, trust_remote_code=True) except Exception as e: - logger.info(f"Processor was not found: {e}.") + if verbose: + logger.info(f"Processor was not found: {e}.") processor = tokenizer + if processor is not None and "Processor" not in processor.__class__.__name__: processor = None @@ -200,6 +192,168 @@ def convert_checkpoint_to_hf( processor.save_pretrained(save_directory) +def convert_adapter_to_hf( + model_name_or_path: str, + adapter_name_or_path: str, + save_directory: str, + torch_dtype: Optional["torch.dtype"] = None, + verbose: bool = True, +): + adapter_names = ( + [ + folder_name + for folder_name in os.listdir(adapter_name_or_path) + if os.path.isdir(os.path.join(adapter_name_or_path, folder_name)) + and os.path.isfile(os.path.join(adapter_name_or_path, folder_name, ADAPTER_CONFIG_NAME)) + ] + if os.path.isdir(adapter_name_or_path) + else [] + ) + if not adapter_names: + raise ValueError(f"No LoRA adapters found in {adapter_name_or_path}") + + peft_configs = { + adapter_name: PeftConfig.from_pretrained(os.path.join(adapter_name_or_path, adapter_name)) + for adapter_name in adapter_names + } + + mca_config, hf_config = _load_mca_config_and_setup(adapter_name_or_path) + hf_state_dict = defaultdict(dict) + + # 转换每个 adapter 的权重 + for adapter_name, peft_config in peft_configs.items(): + adapter_checkpoint_path = os.path.join(adapter_name_or_path, adapter_name) + _convert_state_dicts( + adapter_checkpoint_path, + mca_config, + hf_state_dict[adapter_name], + verbose=verbose, + adapter_name=adapter_name, + lora_rank=peft_config.r, + ) + + # 
创建模型并加载 adapter + model_class = _get_hf_model_class(hf_config, mca_config) + hf_config.save_pretrained(save_directory) + + model = model_class.from_pretrained( + model_name_or_path, + config=hf_config, + torch_dtype=torch_dtype if torch_dtype is not None else mca_config.params_dtype, + trust_remote_code=True, + ) + + # 加载第一个 adapter + adapter0_name = "default" if "default" in hf_state_dict else sorted(hf_state_dict.keys())[0] + target_modules = [ + name[: name.find(".lora")].split(".")[-1] + for name in hf_state_dict[adapter0_name].keys() + if ".lora_A." in name or ".lora_B." in name + ] + target_modules = list(set(target_modules)) + kwargs = {} + if mca_config.num_moe_experts is not None: # MoE model + rank_pattern = { + "down_proj": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + "up_proj": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + "gate_proj": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + "w1": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + "w2": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + "w3": peft_configs[adapter0_name].r // mca_config.moe_router_topk, + } + kwargs["rank_pattern"] = rank_pattern + + lora_config = LoraConfig( + r=peft_configs[adapter0_name].r, + target_modules=target_modules, + lora_alpha=peft_configs[adapter0_name].lora_alpha, + lora_dropout=peft_configs[adapter0_name].lora_dropout, + use_rslora=peft_configs[adapter0_name].use_rslora, + modules_to_save=peft_configs[adapter0_name].modules_to_save, + **kwargs, + ) + model = get_peft_model(model, lora_config, adapter_name=adapter0_name) + set_peft_model_state_dict(model.base_model.model, hf_state_dict[adapter0_name], adapter_name=adapter0_name) + + # 加载其他 adapter + for adapter_name, state_dict in hf_state_dict.items(): + if adapter_name == adapter0_name: + continue + target_modules = [ + name[: name.find(".lora")].split(".")[-1] + for name in state_dict.keys() + if ".lora_A." in name or ".lora_B." 
in name + ] + target_modules = list(set(target_modules)) + kwargs = {} + if mca_config.num_moe_experts is not None: # MoE model + rank_pattern = { + "down_proj": peft_configs[adapter_name].r // mca_config.moe_router_topk, + "up_proj": peft_configs[adapter_name].r // mca_config.moe_router_topk, + "gate_proj": peft_configs[adapter_name].r // mca_config.moe_router_topk, + "w1": peft_configs[adapter_name].r // mca_config.moe_router_topk, + "w2": peft_configs[adapter_name].r // mca_config.moe_router_topk, + "w3": peft_configs[adapter_name].r // mca_config.moe_router_topk, + } + kwargs["rank_pattern"] = rank_pattern + + lora_config = LoraConfig( + r=peft_configs[adapter_name].r, + target_modules=target_modules, + lora_alpha=peft_configs[adapter_name].lora_alpha, + lora_dropout=peft_configs[adapter_name].lora_dropout, + use_rslora=peft_configs[adapter_name].use_rslora, + modules_to_save=peft_configs[adapter_name].modules_to_save, + **kwargs, + ) + model.add_adapter(adapter_name, lora_config) + set_peft_model_state_dict(model.base_model.model, state_dict, adapter_name=adapter_name) + + model.save_pretrained(save_directory) + mca_config.save_hf_auto_map_files(save_directory) + _save_tokenizer_and_processor(adapter_name_or_path, save_directory, verbose) + + +def convert_checkpoint_to_hf( + model_name_or_path: str, + save_directory: str, + adapter_name_or_path: Optional[str] = None, + torch_dtype: Optional["torch.dtype"] = None, + verbose: bool = True, +): + if adapter_name_or_path is not None: + if not is_peft_available(): + raise ImportError("PEFT is not installed. Please install it with `pip install peft`") + convert_adapter_to_hf( + model_name_or_path=model_name_or_path, + adapter_name_or_path=adapter_name_or_path, + save_directory=save_directory, + torch_dtype=torch_dtype, + verbose=verbose, + ) + return + + ckpt_path = model_name_or_path + mca_config, hf_config = _load_mca_config_and_setup(ckpt_path) + hf_state_dict = {} + + # 转换权重 + _convert_state_dicts(ckpt_path, mca_config, hf_state_dict, verbose=verbose) + + # 创建并保存模型 + model_class = _get_hf_model_class(hf_config, mca_config) + model = model_class.from_pretrained( + None, + config=hf_config, + state_dict=hf_state_dict, + torch_dtype=torch_dtype if torch_dtype is not None else mca_config.params_dtype, + trust_remote_code=True, + ) + model.save_pretrained(save_directory) + mca_config.save_hf_auto_map_files(save_directory) + _save_tokenizer_and_processor(ckpt_path, save_directory, verbose) + + def convert_checkpoint_to_mca( model_name_or_path: str, save_directory: str, diff --git a/mcore_adapter/src/mcore_adapter/models/converter/template.py b/mcore_adapter/src/mcore_adapter/models/converter/template.py index 9b509e820..040bb6f97 100644 --- a/mcore_adapter/src/mcore_adapter/models/converter/template.py +++ b/mcore_adapter/src/mcore_adapter/models/converter/template.py @@ -1,4 +1,5 @@ import json +import os import re from abc import ABC from dataclasses import dataclass, field @@ -35,7 +36,7 @@ class ConverOp(ABC): hf_names: Union[str, list] mca_names: Union[str, list] - mca_config: "TransformerConfig" = None + _mca_config: "TransformerConfig" = field(default=None, repr=False) def __post_init__(self): if isinstance(self.hf_names, str): @@ -53,6 +54,14 @@ def __call__(self, name_to_weight: Dict[str, torch.Tensor], mca_to_hf: bool = Fa else: return self.hf_to_mca(name_to_weight) + @property + def mca_config(self) -> "TransformerConfig": + return self._mca_config + + @mca_config.setter + def mca_config(self, value: "TransformerConfig"): + 
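The `convert_adapter_to_hf` path above derives the PEFT `target_modules` from the converted LoRA weight names and, for MoE checkpoints, builds a `rank_pattern` by dividing the adapter rank by `moe_router_topk`. A self-contained sketch of those two steps (module names and the per-expert-rank interpretation are assumptions inferred from the hunk, not a documented contract):

```python
def derive_target_modules(lora_state_dict: dict) -> list[str]:
    # e.g. "model.layers.0.self_attn.q_proj.lora_A.weight" -> "q_proj"
    mods = set()
    for name in lora_state_dict:
        if ".lora_A." in name or ".lora_B." in name:
            mods.add(name[: name.find(".lora")].split(".")[-1])
    return sorted(mods)


def derive_rank_pattern(r: int, moe_router_topk: int) -> dict[str, int]:
    # The division by moe_router_topk mirrors the hunk above; the per-expert
    # rank it implies is an assumption about how the MCA side stores grouped
    # expert LoRA weights.
    per_expert = r // moe_router_topk
    return {m: per_expert for m in ("gate_proj", "up_proj", "down_proj", "w1", "w2", "w3")}


print(derive_rank_pattern(r=32, moe_router_topk=8))  # every expert projection gets rank 4
```

With the adapter directories in place, the LoRA branch is reached through the same entry point as a full conversion (paths below are placeholders):

```python
from mcore_adapter.models.converter.post_converter import convert_checkpoint_to_hf

convert_checkpoint_to_hf(
    model_name_or_path="/models/Qwen3-8B",                  # HF base weights
    save_directory="/ckpts/hf/qwen3-lora-step100",
    adapter_name_or_path="/ckpts/mca/qwen3-lora-step100",   # routes to convert_adapter_to_hf
)
```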
self._mca_config = value + @staticmethod def _name_to_pattern(name: str): return name.replace(".", "\.").replace("{}", "(.*)") @@ -282,6 +291,7 @@ class Template: prefix_name_to_weight: Dict[str, Dict[str, torch.Tensor]] = field(default_factory=dict) def __post_init__(self): + self.config_hf_to_mca = self.adjust_config_hf_to_mca() if self.config_mca_to_hf is None: self.config_mca_to_hf = {v: k for k, v in self.config_hf_to_mca.items()} self.hf_name_to_converter = {} @@ -303,6 +313,54 @@ def release(self): logger.warning(f"weights not converted {len(weights_not_converted)} {weights_not_converted}") self.prefix_name_to_weight = {} + def adjust_config_hf_to_mca(self): + return self.config_hf_to_mca + + def get_hf_config_value(self, hf_config, key, cfg_errs: List[str] = []): + for name in key.split("."): + if not hasattr(hf_config, name): + # warn instead of assert to be backward compatible + # some cfg not exist in hf_config, such as vision_token_id + logger.warning(f"{key=} not exists in hf_config for get_hf_config_value") + cfg_errs.append(key) + return + hf_config = getattr(hf_config, name) + return hf_config + + def set_hf_config_value(self, hf_config, key, value): + # hf_config is a dict from config.to_dict() by `to_json_string(use_diff=True)`, + # sub-configs with PretrainedConfig type would be convert to dict + # use_diff makes hf_config only contain items whose value is different from default + raw_hf_config = hf_config + names = key.split(".") + for i, name in enumerate(names): + if isinstance(hf_config, dict): + if name not in hf_config: + # to be backward compatible + # always put mca config value into hf config kw_args + logger.warning( + f"{key=} not exists in hf_config for set_hf_config_value, " + f"ignore this if no warning in get_hf_config_value" + ) + raw_hf_config[key] = value + if i == len(names) - 1: + hf_config[name] = value + else: + hf_config = hf_config[name] + else: + if not hasattr(hf_config, name): + # to be backward compatible + # always put mca config value into hf config kw_args + logger.warning( + f"{key=} not exists in hf_config for set_hf_config_value, " + f"ignore this if no warning in get_hf_config_value" + ) + raw_hf_config[key] = value + if i == len(names) - 1: + setattr(hf_config, name, value) + else: + hf_config = getattr(hf_config, name) + def convert_hf_to_mca_config(self, hf_config, **kw_args): from ...models.auto.config_auto import AutoConfig as AutoMcaModelConfig @@ -310,33 +368,34 @@ def convert_hf_to_mca_config(self, hf_config, **kw_args): return AutoMcaModelConfig.for_model(self.hf_model_type, **kw_args) def convert_hf_to_mca_config_kws(self, hf_config: "PretrainedConfig", **kw_args): - # TODO: support text_config - if hasattr(hf_config, "text_config"): - text_config = hf_config.text_config.to_dict() - for k, v in text_config.items(): - setattr(hf_config, k, v) - for k, v in self.config_hf_to_mca.items(): - if hasattr(hf_config, k): - kw_args[v] = getattr(hf_config, k) + cfg_errs = [] + cfg_value = self.get_hf_config_value(hf_config, k, cfg_errs) + if not cfg_errs: # cfg_value can be any, use cfg_errs to check + kw_args[v] = cfg_value kw_args["hf_model_type"] = self.hf_model_type kw_args["name_or_path"] = hf_config.name_or_path kw_args["hf_config_json"] = hf_config.to_json_string() return {**kw_args, **self.constant_mca_config} def convert_mca_to_hf_config(self, mca_config, **kw_args): + config_dict = json.loads(mca_config.hf_config_json) for k, v in self.config_mca_to_hf.items(): if hasattr(mca_config, k): - kw_args[v] = getattr(mca_config, k) + 
self.set_hf_config_value(config_dict, v, getattr(mca_config, k)) kw_args.update(self.constant_hf_config) kw_args["name_or_path"] = mca_config.name_or_path - config_dict = json.loads(mca_config.hf_config_json) kw_args = {**config_dict, **kw_args} kw_args["model_type"] = self.hf_model_type has_remote_code = "auto_map" in config_dict and "AutoConfig" in config_dict["auto_map"] if has_remote_code: class_ref = config_dict["auto_map"]["AutoConfig"] - config_class = get_class_from_dynamic_module(class_ref, mca_config.name_or_path) + pretrained_model_name_or_path = mca_config.name_or_path + automap_cache_path = mca_config.get_automap_cache() + read_cache = os.path.isdir(automap_cache_path) and any(f.endswith('.py') for f in os.listdir(automap_cache_path)) + if read_cache: + pretrained_model_name_or_path = automap_cache_path + config_class = get_class_from_dynamic_module(class_ref, pretrained_model_name_or_path) config_class.register_for_auto_class() return config_class.from_dict(kw_args) return AutoConfig.for_model(**kw_args) @@ -370,7 +429,7 @@ def add_hf_weight(self, name, weight): mca_prefix = convert_to_mca_prefix(weight_prefix, self.hf_layer_prefix, self.hf_moe_prefix) return {mca_prefix + name: weight for name, weight in conver_res.items()} - def add_mca_weight(self, name, weight): + def add_mca_weight(self, name, weight, **kwargs): weight_prefix = get_mca_weight_prefix(name) original_name = remove_mca_weight_prefix(name) if weight_prefix not in self.prefix_name_to_weight: @@ -378,7 +437,7 @@ def add_mca_weight(self, name, weight): self.prefix_name_to_weight[weight_prefix][original_name] = weight prefix_weights = self.prefix_name_to_weight[weight_prefix] if ".lora_A." in original_name or ".lora_B." in original_name: - op = self.get_lora_conver_op(original_name, self.mca_name_to_converter) + op = self.get_lora_conver_op(original_name, self.mca_name_to_converter, **kwargs) else: op = self.get_conver_op(original_name, self.mca_name_to_converter) name_to_weight = { @@ -403,9 +462,9 @@ def get_conver_op(self, name, pattern_to_conver_ops: Dict[str, ConverOp]): return pattern_to_conver_ops[pattern] raise ValueError(f"can not find conver op for {name} in {pattern_to_conver_ops}") - def get_lora_conver_op(self, name, pattern_to_conver_ops: Dict[str, ConverOp]): - lora_name = name[name.find(".lora"):] - name = name[:name.find(".lora")] + ".weight" + def get_lora_conver_op(self, name, pattern_to_conver_ops: Dict[str, ConverOp], lora_rank: int): + lora_name = name[name.find(".lora") :] + name = name[: name.find(".lora")] + ".weight" op = self.get_conver_op(name, pattern_to_conver_ops) if isinstance(op, RenameConverOp): op_class = RenameConverOp @@ -418,13 +477,13 @@ def get_lora_conver_op(self, name, pattern_to_conver_ops: Dict[str, ConverOp]): kwargs = {"dim": op.dim} elif isinstance(op, QKVConverOp): op_class = QKVConverOp - kwargs = {"hidden_size": op.mca_config.lora_rank} + kwargs = {"hidden_size": lora_rank} else: raise ValueError(f"can not find lora conver op for {name} in {pattern_to_conver_ops}") return op_class( hf_names=[hf_name.replace(".weight", lora_name) for hf_name in op.hf_names], mca_names=[mca_name.replace(".weight", lora_name) for mca_name in op.mca_names], - mca_config=op.mca_config, + _mca_config=op.mca_config, **kwargs, ) diff --git a/mcore_adapter/src/mcore_adapter/models/deepseek_v3/__init__.py b/mcore_adapter/src/mcore_adapter/models/deepseek_v3/__init__.py index a102871a3..b2a4be612 100644 --- a/mcore_adapter/src/mcore_adapter/models/deepseek_v3/__init__.py +++ 
b/mcore_adapter/src/mcore_adapter/models/deepseek_v3/__init__.py @@ -5,6 +5,7 @@ from ..converter.convert_utils import ( get_layer_index, get_mca_layer_index, + get_mca_mtp_layer_index, remove_weight_prefix, ) from ..converter.dist_converter import mla_dist_config, register_dist_config @@ -103,12 +104,12 @@ def add_hf_weight(self, name, weight): res[name] = weight return res - def add_mca_weight(self, name, weight): + def add_mca_weight(self, name, weight, **kwargs): name = self.revert_mtp_name(name) layer_index = get_mca_layer_index(name) if layer_index is not None and layer_index < self.mca_config.moe_layer_freq.count(0): name = name.replace("mlp.linear_fc1.layer_norm_", "pre_mlp_layernorm.") - name2weights = super().add_mca_weight(name, weight) + name2weights = super().add_mca_weight(name, weight, **kwargs) res = {} for name, weight in name2weights.items(): if ( @@ -141,17 +142,14 @@ def convert_mtp_name(self, name): name = name.replace("decoder", "mtp") pure_name = remove_weight_prefix(name, prefix="mtp.layers.") name = ( - "mtp.layers." - + str(mtp_layer_index) - + (".transformer_layer" if has_transformer_layer else "") - + pure_name + "mtp.layers." + str(mtp_layer_index) + (".transformer_layer" if has_transformer_layer else "") + pure_name ) return name def revert_mtp_name(self, name): if "mtp" in name: has_transformer_layer = "self_attention" in name or "mlp" in name or "input_layernorm" in name - mtp_layer_index = get_layer_index(name, prefix="mtp.layers.") + mtp_layer_index = get_mca_mtp_layer_index(name) pure_name = remove_weight_prefix(name, prefix="mtp.layers.") # only consider padding mtp for now... mca_layer_index = mtp_layer_index + self.mca_config.num_layers @@ -301,9 +299,7 @@ def revert_mtp_name(self, name): RenameConverOp(hf_names=".hnorm.weight", mca_names=".hnorm.weight"), RenameConverOp(hf_names=".eh_proj.weight", mca_names=".eh_proj.weight"), RenameConverOp(hf_names=".shared_head.norm.weight", mca_names=".final_layernorm.weight"), - RenameConverOp( - hf_names=".self_attn.o_proj.weight_scale_inv", mca_names=".self_attn.o_proj.weight_scale_inv" - ), + RenameConverOp(hf_names=".self_attn.o_proj.weight_scale_inv", mca_names=".self_attn.o_proj.weight_scale_inv"), RenameConverOp(hf_names=".post_attention_layernorm.weight", mca_names=".pre_mlp_layernorm.weight"), RenameConverOp(hf_names="model.norm.weight", mca_names="decoder.final_layernorm.weight"), RenameConverOp(hf_names=".mlp.gate.weight", mca_names=".mlp.router.weight"), diff --git a/mcore_adapter/src/mcore_adapter/models/glm4_moe/__init__.py b/mcore_adapter/src/mcore_adapter/models/glm4_moe/__init__.py index 70ac78e33..1cca1e914 100644 --- a/mcore_adapter/src/mcore_adapter/models/glm4_moe/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/glm4_moe/__init__.py @@ -5,8 +5,10 @@ from ..converter.convert_utils import ( get_layer_index, get_mca_layer_index, + get_mca_mtp_layer_index, + remove_weight_prefix, ) -from ..converter.dist_converter import DistParallelConfig, default_dist_config, register_dist_config +from ..converter.dist_converter import DistParallelConfig, default_dist_config, mtp_config, register_dist_config from ..converter.template import ( QKVBiasConverOp, QKVConverOp, @@ -18,6 +20,7 @@ from ..model_config import McaModelConfig from ..model_factory import McaGPTModel + class Glm4MoeTemplate(Template): def convert_hf_to_mca_config_kws(self, hf_config, **kw_args): partial_rotary_factor = getattr(hf_config, "partial_rotary_factor", None) @@ -79,6 +82,8 @@ def add_hf_weight(self, name, weight): res = {} 
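Both the DeepSeek-V3 and GLM-4-MoE templates map multi-token-prediction layers by offsetting their index past the regular decoder stack: `mtp.layers.<i>` corresponds to `decoder.layers.<num_layers + i>`. A worked sketch of that arithmetic (the layer count is hypothetical):

```python
NUM_LAYERS = 61  # hypothetical decoder depth


def to_mtp_index(mca_layer_index: int) -> int | None:
    # direction used by convert_mtp_name
    if mca_layer_index < NUM_LAYERS:
        return None  # ordinary decoder layer, name is left untouched
    return mca_layer_index - NUM_LAYERS


def to_mca_index(mtp_layer_index: int) -> int:
    # direction used by revert_mtp_name
    return mtp_layer_index + NUM_LAYERS


print(to_mtp_index(61))  # 0  -> "mtp.layers.0.transformer_layer...."
print(to_mca_index(0))   # 61 -> "decoder.layers.61...."
```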
for name, weight in name2weights.items(): layer_index = get_mca_layer_index(name) + if layer_index is not None and layer_index >= self.mca_config.num_layers: + name = self.convert_mtp_name(name) if layer_index is not None and layer_index < self.mca_config.moe_layer_freq.count(0): # dense layer use fused `TELayerNormColumnParallelLinear`, change the name if "pre_mlp_layernorm" in name: @@ -86,11 +91,12 @@ def add_hf_weight(self, name, weight): res[name] = weight return res - def add_mca_weight(self, name, weight): + def add_mca_weight(self, name, weight, **kwargs): + name = self.revert_mtp_name(name) layer_index = get_mca_layer_index(name) if layer_index is not None and layer_index < self.mca_config.moe_layer_freq.count(0): name = name.replace("mlp.linear_fc1.layer_norm_", "pre_mlp_layernorm.") - name2weights = super().add_mca_weight(name, weight) + name2weights = super().add_mca_weight(name, weight, **kwargs) res = {} for name, weight in name2weights.items(): if ( @@ -107,6 +113,39 @@ def add_mca_weight(self, name, weight): res[name] = weight return res + def hf_name_to_mca_names(self, hf_name): + mca_names = super().hf_name_to_mca_names(hf_name) + if mca_names is None: + return None + mtp_mca_names = [self.convert_mtp_name(mca_name) for mca_name in mca_names] + return mtp_mca_names + + def convert_mtp_name(self, name): + mca_layer_index = get_mca_layer_index(name) + if mca_layer_index is None or mca_layer_index < self.mca_config.num_layers: + return name + mtp_layer_index = mca_layer_index - self.mca_config.num_layers + has_transformer_layer = "self_attention" in name or "mlp" in name or "input_layernorm" in name + name = name.replace("decoder", "mtp") + pure_name = remove_weight_prefix(name, prefix="mtp.layers.") + name = ( + "mtp.layers." + str(mtp_layer_index) + (".transformer_layer" if has_transformer_layer else "") + pure_name + ) + return name + + def revert_mtp_name(self, name): + if "mtp" in name: + has_transformer_layer = "self_attention" in name or "mlp" in name or "input_layernorm" in name + mtp_layer_index = get_mca_mtp_layer_index(name) + pure_name = remove_weight_prefix(name, prefix="mtp.layers.") + # only consider padding mtp for now... + mca_layer_index = mtp_layer_index + self.mca_config.num_layers + name = ( + "decoder.layers." 
+ + str(mca_layer_index) + + (pure_name.replace(".transformer_layer", "") if has_transformer_layer else pure_name) + ) + return name register_template( @@ -115,7 +154,7 @@ def add_mca_weight(self, name, weight): hf_moe_prefix=".mlp.experts.", template_class=Glm4MoeTemplate, hf_invalid_keys=[ - ".embed_tokens.weight", # skip layers.x.embed_tokens + ".embed_tokens.weight", # skip layers.x.embed_tokens ".shared_head.head.weight", ], config_hf_to_mca={ @@ -135,8 +174,10 @@ def add_mca_weight(self, name, weight): # MoE related "moe_intermediate_size": "moe_ffn_hidden_size", "decoder_sparse_step": "moe_layer_freq", - "n_routed_experts": "num_moe_experts", # diff + "n_routed_experts": "num_moe_experts", # diff "num_experts_per_tok": "moe_router_topk", + # MTP related + "num_nextn_predict_layers": "mtp_num_layers", }, constant_mca_config={ "swiglu": True, @@ -150,6 +191,7 @@ def add_mca_weight(self, name, weight): "qk_layernorm": False, "moe_router_enable_expert_bias": True, "moe_router_score_function": "sigmoid", + "mtp_loss_scaling_factor": 0.3, }, weight_converters=[ RenameConverOp(hf_names="lm_head.weight", mca_names="output_layer.weight"), @@ -157,13 +199,15 @@ def add_mca_weight(self, name, weight): RenameConverOp(hf_names=".input_layernorm.weight", mca_names=".self_attention.linear_qkv.layer_norm_weight"), RenameConverOp(hf_names=".self_attn.o_proj.weight", mca_names=".self_attention.linear_proj.weight"), RenameConverOp(hf_names=".post_attention_layernorm.weight", mca_names=".pre_mlp_layernorm.weight"), - RenameConverOp(hf_names=".mlp.down_proj.weight", mca_names=".mlp.linear_fc2.weight"), # first layer - StackConverOp(hf_names=[".mlp.gate_proj.weight", ".mlp.up_proj.weight"], mca_names=".mlp.linear_fc1.weight", dim=0), + RenameConverOp(hf_names=".mlp.down_proj.weight", mca_names=".mlp.linear_fc2.weight"), # first layer + StackConverOp( + hf_names=[".mlp.gate_proj.weight", ".mlp.up_proj.weight"], mca_names=".mlp.linear_fc1.weight", dim=0 + ), StackConverOp(hf_names=[".gate_proj.weight", ".up_proj.weight"], mca_names=".linear_fc1.weight", dim=0), RenameConverOp(hf_names=".down_proj.weight", mca_names=".linear_fc2.weight"), RenameConverOp(hf_names="model.norm.weight", mca_names="decoder.final_layernorm.weight"), RenameConverOp(hf_names=".mlp.gate.weight", mca_names=".mlp.router.weight"), - StackConverOp( # for shared + StackConverOp( # for shared hf_names=[".mlp.shared_experts.gate_proj.weight", ".mlp.shared_experts.up_proj.weight"], mca_names=".mlp.shared_experts.linear_fc1.weight", dim=0, @@ -189,23 +233,24 @@ def add_mca_weight(self, name, weight): register_config("glm4_moe", McaModelConfig) register_model("glm4_moe", McaGPTModel) -glm_dist_config = default_dist_config.merge_configs( +glm_dist_config = default_dist_config.merge_configs(mtp_config).merge_configs( DistParallelConfig( - duplicated_weights=[ - ".mlp.router.expert_bias", - ], - grouped_column_map={".linear_fc1.weight": ".mlp.experts.weight1"}, - grouped_row_map={".linear_fc2.weight": ".mlp.experts.weight2"}, - row_parallel_weights=[ - ".self_attention.linear_proj.weight", - ".mlp.shared_experts.linear_fc2.weight", - ".linear_fc2.weight", - ".mlp.linear_fc2.weight", - ], - swiglu_weights=[ - ".mlp.shared_experts.linear_fc1.weight", - ".linear_fc1.weight", - ".mlp.linear_fc1.weight", - ], -)) + duplicated_weights=[ + ".mlp.router.expert_bias", + ], + grouped_column_map={".linear_fc1.weight": ".mlp.experts.weight1"}, + grouped_row_map={".linear_fc2.weight": ".mlp.experts.weight2"}, + row_parallel_weights=[ + 
".self_attention.linear_proj.weight", + ".mlp.shared_experts.linear_fc2.weight", + ".linear_fc2.weight", + ".mlp.linear_fc2.weight", + ], + swiglu_weights=[ + ".mlp.shared_experts.linear_fc1.weight", + ".linear_fc1.weight", + ".mlp.linear_fc1.weight", + ], + ) +) register_dist_config("glm4_moe", glm_dist_config) diff --git a/mcore_adapter/src/mcore_adapter/models/model_config.py b/mcore_adapter/src/mcore_adapter/models/model_config.py index ba11580f8..3ba6c2296 100644 --- a/mcore_adapter/src/mcore_adapter/models/model_config.py +++ b/mcore_adapter/src/mcore_adapter/models/model_config.py @@ -5,7 +5,7 @@ import json import os import shutil -from dataclasses import dataclass, field +from dataclasses import dataclass, field, fields from typing import TYPE_CHECKING, Literal, Optional import torch @@ -82,7 +82,26 @@ def to_json_file(self, json_file_path): def from_json_file(cls, json_file_path): with open(json_file_path, "r", encoding="utf-8") as reader: text = reader.read() - return cls(**json.loads(text)) + config_dict = json.loads(text) + + valid_field_names = {f.name for f in fields(cls)} + + filtered_config = {} + removed_keys = [] + for k, v in config_dict.items(): + if k in valid_field_names: + filtered_config[k] = v + else: + removed_keys.append(k) + + if removed_keys: + logger.warning( + f"Config loading from {json_file_path}: " + f"Ignoring deprecated/unknown properties: {removed_keys}. " + "This might be due to a Megatron version upgrade." + ) + + return cls(**filtered_config) def save_pretrained(self, save_directory: str): os.makedirs(save_directory, exist_ok=True) @@ -96,7 +115,9 @@ def save_hf_auto_map_files(self, save_directory: str): # name_or_path denotes the path of the from_pretrained model, i.e., where auto map files are located # should archive the auto map files in a cache path avoiding the remote name_or_path path has been cleaned automap_cache_path = self.get_automap_cache() - read_cache = os.path.isdir(automap_cache_path) and any(f.endswith('.py') for f in os.listdir(automap_cache_path)) + read_cache = os.path.isdir(automap_cache_path) and any( + f.endswith(".py") for f in os.listdir(automap_cache_path) + ) hf_files_path = automap_cache_path if read_cache else self.name_or_path if not (hf_files_path and os.path.isdir(hf_files_path)): return @@ -175,17 +196,19 @@ def distribute_config_match(self, other): raise NotImplementedError("distribute_config_match not implemented") def get_automap_cache(self): - return os.path.join(os.getenv("HUGGINGFACE_AUTOMAP_CACHE", HUGGINGFACE_AUTOMAP_CACHE), - hashlib.sha256(self.name_or_path.encode()).hexdigest()) + return os.path.join( + os.getenv("HUGGINGFACE_AUTOMAP_CACHE", HUGGINGFACE_AUTOMAP_CACHE), + hashlib.sha256(self.name_or_path.encode()).hexdigest(), + ) @dataclass class McaModelConfig(TransformerConfig, PretrainedConfig): - position_embedding_type: Literal["learned_absolute", "rope", "none"] = field( + position_embedding_type: Literal["learned_absolute", "rope", "mrope", "yarn", "none"] = field( default="rope", metadata={ "help": "Position embedding type.", - "choices": ["learned_absolute", "rope", "mrope", "none"], + "choices": ["learned_absolute", "rope", "mrope", "yarn", "none"], }, ) padded_vocab_size: Optional[int] = field( @@ -224,6 +247,14 @@ class McaModelConfig(TransformerConfig, PretrainedConfig): default=False, metadata={"help": "Apply rope scaling as used in llama 3.x."}, ) + rotary_scaling_factor: float = field( + default=8.0, + metadata={ + "help": "The scaling factor applied to the inverse frequencies when " + "1) 
the wavelength is greater than `low_freq_wavelen` prior to smoothing, " + "2) to all inverse frequencies during smoothing." + }, + ) transformer_impl: Literal["local", "transformer_engine"] = field( default="transformer_engine", metadata={ @@ -256,6 +287,24 @@ def squared_relu(x): self.pipeline_dtype = self.params_dtype self.batch_p2p_comm = not self.overlap_p2p_comm + # Initialize Yarn RoPE parameters when position_embedding_type is "yarn" + if self.position_embedding_type == "yarn": + # Dynamically add Yarn config attributes only when using yarn + if not hasattr(self, "yarn_beta_fast"): + self.yarn_beta_fast = 32 + if not hasattr(self, "yarn_beta_slow"): + self.yarn_beta_slow = 1 + if not hasattr(self, "yarn_rotary_scaling_factor"): + self.yarn_rotary_scaling_factor = 4 + if not hasattr(self, "yarn_original_max_position_embeddings"): + self.yarn_original_max_position_embeddings = 32768 + if not hasattr(self, "yarn_mscale"): + self.yarn_mscale = 1 + if not hasattr(self, "yarn_mscale_all_dim"): + self.yarn_mscale_all_dim = 0 + if not hasattr(self, "yarn_correction_range_round_to_int"): + self.yarn_correction_range_round_to_int = True + if ( self.recompute_granularity == "full" and self.recompute_method is None diff --git a/mcore_adapter/src/mcore_adapter/models/model_factory.py b/mcore_adapter/src/mcore_adapter/models/model_factory.py index f6792a7da..9b2e8686e 100644 --- a/mcore_adapter/src/mcore_adapter/models/model_factory.py +++ b/mcore_adapter/src/mcore_adapter/models/model_factory.py @@ -13,18 +13,28 @@ get_gpt_mtp_block_spec, ) from megatron.core.transformer.module import MegatronModule +from transformers.tokenization_utils import PreTrainedTokenizer +from transformers.utils import is_peft_available from ..checkpointing import load_state_dict_from_checkpoint, save_config_and_state_dict from ..platforms import current_platform -from ..utils import get_logger, is_peft_available +from ..utils import get_logger from .converter.convert_utils import MAX_SHARD_SIZE from .converter.model_converter import ModelConverter from .model_config import McaModelConfig -from .model_utils import ModuleUtilsMixin, RMSNorm, exists_hf_config, exists_mca_config, get_thd_data_on_this_cp_rank +from .model_utils import ( + ModuleUtilsMixin, + RMSNorm, + configure_resized_vocab_size, + exists_hf_config, + exists_mca_config, + get_thd_data_on_this_cp_rank, + mca_lora_logits_postprocess_hook, +) if is_peft_available(): - from peft import PeftModel + from peft import PeftModel, get_peft_model_state_dict, set_peft_model_state_dict if TYPE_CHECKING: @@ -48,9 +58,16 @@ def __init__(self, cls, config: "McaModelConfig", *args, **kwargs): def save_pretrained(self, save_directory: str): if len(self.models) == 1: if is_peft_available() and isinstance(self.models[0], PeftModel): - for _, peft_config in self.models[0].peft_config.items(): - peft_config.save_pretrained(save_directory) - return self.models[0].base_model.model.save_pretrained(save_directory) + for adapter_name, peft_config in self.models[0].peft_config.items(): + adapter_save_directory = os.path.join(save_directory, adapter_name) + peft_config.save_pretrained(adapter_save_directory) + peft_state_dict = get_peft_model_state_dict( + self.models[0], self.models[0].state_dict_for_save_checkpoint(), adapter_name + ) + self.models[0].base_model.model.save_pretrained( + adapter_save_directory, state_dict={"model": peft_state_dict} + ) + return self.config.save_pretrained(save_directory) return self.models[0].save_pretrained(save_directory) state_dict = 
{f"model{i}": model.state_dict_for_save_checkpoint() for i, model in enumerate(self.models)} return self.models[0].save_pretrained(save_directory, state_dict=state_dict) @@ -60,7 +77,19 @@ def load_state_dict(self, state_dict: Dict[str, torch.Tensor], strict: bool = Tr if "model" in state_dict: state_dict = state_dict["model"] if is_peft_available() and isinstance(self.models[0], PeftModel): - return self.models[0].base_model.model.load_state_dict(state_dict, strict=False) + all_missing_keys, all_unexpected_keys = [], [] + for adapter_name in self.models[0].peft_config.keys(): + ret = set_peft_model_state_dict( + self.models[0].base_model.model, + state_dict[adapter_name]["model"] + if "model" in state_dict[adapter_name] + else state_dict[adapter_name], + adapter_name, + ) + if not strict: + all_missing_keys.extend(ret[0]) + all_unexpected_keys.extend(ret[1]) + return all_missing_keys, all_unexpected_keys return self.models[0].load_state_dict(state_dict, strict=strict) all_missing_keys, all_unexpected_keys = [], [] for i, model in enumerate(self.models): @@ -134,19 +163,9 @@ def save_pretrained_as_hf( os.makedirs(save_directory, exist_ok=True) converter = ModelConverter(self.config, to_hf=True) converter.save_model_as_hf_inflight( - self.models, save_directory, save_safetensors=save_safetensors, max_shard_size=max_shard_size + self.models, save_directory, save_safetensors=save_safetensors, max_shard_size=max_shard_size, move_to_cpu=True, ) - def all_gather_weights_as_hf_inflight(self, models=None): - models = models or self.models - converter = ModelConverter(self.config, to_hf=True) - yield from converter.all_gather_weights_as_hf_inflight(models) - - def all_gather_weights_as_hf_bucket(self, models=None, bucket_size: int = None): - models = models or self.models - converter = ModelConverter(self.config, to_hf=True) - yield from converter.all_gather_weights_as_hf_bucket(models, bucket_size=bucket_size) - def get_batch_on_this_cp_rank(self, *args, **kwargs): return self.models[0].get_batch_on_this_cp_rank(*args, **kwargs) @@ -166,11 +185,18 @@ class PretrainedModel(MegatronModule, ModuleUtilsMixin): @classmethod def from_pretrained( - cls, model_name_or_path: str, args: "TrainingArguments" = None, use_cpu_initialization: bool = False + cls, model_name_or_path: str, args: "TrainingArguments" = None, use_cpu_initialization: bool = False, tokenizer: PreTrainedTokenizer = None, ) -> "VirtualModels": load_start_time = time.time() config = cls.config_class.from_pretrained(model_name_or_path, args) config.use_cpu_initialization = use_cpu_initialization + + resized_vocab_size = None + if tokenizer is not None: + resized_vocab_size = configure_resized_vocab_size(config.padded_vocab_size, len(tokenizer)) + if resized_vocab_size: + config.padded_vocab_size = resized_vocab_size + models = VirtualModels(cls, config=config) logger.info( @@ -186,6 +212,9 @@ def from_pretrained( dist_config_match = config.distribute_config_match(old_mca_config) if mca_ckpt_exist and dist_config_match: + if resized_vocab_size: + raise ValueError("The tokenizer length is longer than the vocab embedding size, and the resize embedding" \ + "layer is not supported loading mca ckpt. 
Please check the tokenizer and ckpt.") state_dict = load_state_dict_from_checkpoint(model_name_or_path) else: if not exists_hf_config(model_name_or_path): @@ -194,7 +223,7 @@ def from_pretrained( f"and not mca_ckpt_exist: {mca_ckpt_exist} or not dist_config_match: {dist_config_match}" ) state_dict = {} - converter = ModelConverter(config) + converter = ModelConverter(config, resized_vocab_size=resized_vocab_size) for i in range(len(models)): key = "model" if len(models) > 1: @@ -244,10 +273,9 @@ def get_batch_on_this_cp_rank(self, batch: Dict[str, "torch.Tensor"], dim3_keys: val.shape[seq_dim] // (2 * cp_size), *val.shape[(seq_dim + 1) :], ) - index = torch.tensor([cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True).to( - current_platform.device_type, - non_blocking=True - ) + index = torch.tensor( + [cp_rank, (2 * cp_size - cp_rank - 1)], device="cpu", pin_memory=True + ).to(current_platform.device_type, non_blocking=True) val = val.index_select(seq_dim, index) val = val.view(*val.shape[0:seq_dim], -1, *val.shape[(seq_dim + 2) :]) batch[key] = val @@ -290,7 +318,8 @@ def __init__(self, config: "McaModelConfig", **kwargs): rotary_percent=config.rotary_percent, rotary_base=config.rotary_base, rope_scaling=config.rotary_scaling, - mtp_block_spec=self._get_mtp_block_spec(config), + rope_scaling_factor=config.rotary_scaling_factor, + mtp_block_spec=self._get_mtp_block_spec(config, vp_stage=self.vp_stage), vp_stage=self.vp_stage, ) for param in self.parameters(): @@ -298,6 +327,9 @@ def __init__(self, config: "McaModelConfig", **kwargs): if not config.use_cpu_initialization: self.to(current_platform.current_device()) + if self.post_process or self.mtp_process: + self.output_layer.register_forward_hook(mca_lora_logits_postprocess_hook) + def _get_transformer_layer_spec(self, config: Optional["McaModelConfig"] = None): config = config or self.config use_te = config.transformer_impl == "transformer_engine" @@ -309,7 +341,7 @@ def _get_transformer_layer_spec(self, config: Optional["McaModelConfig"] = None) if not use_te and config.normalization == "RMSNorm": transformer_layer_spec.submodules.input_layernorm = RMSNorm transformer_layer_spec.submodules.pre_mlp_layernorm = RMSNorm - if hasattr(transformer_layer_spec.submodules.mlp.submodules, "shared_experts"): + if getattr(transformer_layer_spec.submodules.mlp.submodules, "shared_experts", None): transformer_layer_spec.submodules.mlp.submodules.shared_experts.params["gate"] = ( config.moe_use_shared_expert_gate ) @@ -327,12 +359,12 @@ def _get_transformer_layer_spec(self, config: Optional["McaModelConfig"] = None) module_spec.submodules.pre_mlp_layernorm = RMSNorm return module_spec - def _get_mtp_block_spec(self, config: Optional["McaModelConfig"] = None): + def _get_mtp_block_spec(self, config: Optional["McaModelConfig"] = None, vp_stage: Optional[int] = None): config = config or self.config if config.mtp_num_layers and config.mtp_num_layers > 0: transformer_layer_spec = self._get_transformer_layer_spec(config) use_te = config.transformer_impl == "transformer_engine" - spec = get_gpt_mtp_block_spec(config, transformer_layer_spec, use_te) + spec = get_gpt_mtp_block_spec(config, transformer_layer_spec, use_te, vp_stage=vp_stage) return spec else: return None diff --git a/mcore_adapter/src/mcore_adapter/models/model_utils.py b/mcore_adapter/src/mcore_adapter/models/model_utils.py index c7c83817e..3fcb7ebbb 100644 --- a/mcore_adapter/src/mcore_adapter/models/model_utils.py +++ b/mcore_adapter/src/mcore_adapter/models/model_utils.py @@ 
-109,6 +109,37 @@ def forward(self, hidden_states): return self.weight * hidden_states.to(input_dtype) +class _McaLoraLogitsHelper(torch.autograd.Function): + @staticmethod + def forward(ctx, logits: "torch.Tensor"): + return logits + + @staticmethod + def backward(ctx, grad_output: "torch.Tensor"): + if grad_output.size(1) == 1: + # tensor.contiguous() does not change strides[1] with shape [sequence_length, 1, vocab_size] + return grad_output.contiguous().view_as(grad_output) + return grad_output.contiguous() + + +def _mca_lora_logits_postprocess(logits: "torch.Tensor"): + """make sure grad_output is contiguous + Args: + logits: logits split across tensor parallel ranks + dimension is [sequence_length, batch_size, vocab_size/num_parallel_ranks] + Returns: + contiguous logits + (It's fine to change the order of sequence_length and batch_size in dimension) + """ + return _McaLoraLogitsHelper.apply(logits) + + +def mca_lora_logits_postprocess_hook(module, input, output): + logits, other = output + logits = _mca_lora_logits_postprocess(logits) + return logits, other + + def exists_hf_config(model_name_or_path: str) -> bool: return os.path.exists(os.path.join(model_name_or_path, "config.json")) @@ -125,7 +156,8 @@ def check_and_get_attention_backend_by_env(attention_backend: AttnBackend): fused_attn = os.getenv("NVTE_FUSED_ATTN", None) unfused_attn = os.getenv("NVTE_UNFUSED_ATTN", None) - is_set_as = lambda env, value: env is not None and env == value + def is_set_as(env, value): + return env is not None and env == value if is_set_as(flash_attn, "0") and is_set_as(fused_attn, "0") and is_set_as(unfused_attn, "0"): return AttnBackend.local @@ -142,7 +174,7 @@ def get_thd_data_on_this_cp_rank( batch: Dict[str, "torch.Tensor"], packed_seq_params: PackedSeqParams, dim3_keys: List[str] = ["attention_mask"] ): """Performs sharding for Context Parallelism in THD format""" - import transformer_engine # type: ignore + import transformer_engine # noqa: F401 import transformer_engine_torch as tex cp_size = mpu.get_context_parallel_world_size() @@ -162,3 +194,20 @@ def get_thd_data_on_this_cp_rank( batch[key] = batch[key].index_select(seq_dim, seq_idx) batch["packed_seq_params"] = packed_seq_params return batch + + +def configure_resized_vocab_size( + original_vocab_size: int, + tokenizer_len: int, + pad_to_multiple_of: int = 64, +): + if original_vocab_size >= tokenizer_len: + return None + new_vocab_size = ( + (tokenizer_len + pad_to_multiple_of - 1) // pad_to_multiple_of + ) * pad_to_multiple_of + logger.info( + f"Tokenizer length: {tokenizer_len} is greater than original vocab size: {original_vocab_size}. " + f"The vocab is resized to {new_vocab_size}." 
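`configure_resized_vocab_size` only ever grows the embedding table, rounding the tokenizer length up to a multiple of 64. A worked example of that rule:

```python
def resized_vocab(original_vocab_size: int, tokenizer_len: int, pad_to: int = 64):
    if original_vocab_size >= tokenizer_len:
        return None  # checkpoint already covers the tokenizer, nothing to resize
    return ((tokenizer_len + pad_to - 1) // pad_to) * pad_to


print(resized_vocab(151_936, 151_700))  # None
print(resized_vocab(151_936, 152_000))  # 152000 (already a multiple of 64)
print(resized_vocab(151_936, 152_001))  # 152064
```

The LoRA logits hook relies on the fact that a forward hook may return a replacement output, so the logits can be routed through an identity `autograd.Function` whose backward only makes the incoming gradient contiguous. A toy stand-in (the real hook unpacks a `(logits, other)` tuple from `output_layer`):

```python
import torch
import torch.nn as nn


class _Contiguify(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x.view_as(x)  # identity; view_as keeps the toy safe for autograd

    @staticmethod
    def backward(ctx, grad_output):
        return grad_output.contiguous()


def contiguify_hook(module, inputs, output):
    return _Contiguify.apply(output)  # returning a value replaces the module output


layer = nn.Linear(4, 4)
layer.register_forward_hook(contiguify_hook)
layer(torch.randn(2, 4, requires_grad=True)).sum().backward()
```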
+ ) + return new_vocab_size diff --git a/mcore_adapter/src/mcore_adapter/models/qwen2_5_vl/modeling_qwen2_5_vl.py b/mcore_adapter/src/mcore_adapter/models/qwen2_5_vl/modeling_qwen2_5_vl.py index 14481889a..9cac57025 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen2_5_vl/modeling_qwen2_5_vl.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen2_5_vl/modeling_qwen2_5_vl.py @@ -26,6 +26,10 @@ def __init__(self, config: "Qwen2_5_VLConfig", **kwargs): attn_implementation="sdpa", torch_dtype=self.config.params_dtype, ).to(current_platform.current_device()) + # TODO: use_reentrant=True might cause error by twice forward/backward when + # training images and videos simultaneously, https://github.com/pytorch/pytorch/issues/81296 + if config.recompute_granularity == "full" and self.training: + self.vision_model.gradient_checkpointing_enable({"use_reentrant": False}) for param in self.vision_model.parameters(): setattr(param, "sequence_parallel", config.sequence_parallel) @@ -324,19 +328,7 @@ def forward( **kwargs, ) -> "torch.Tensor": force_vit_image = kwargs.pop("force_vit_image", False) - force_vit_video = kwargs.pop("force_vit_video", False) - - if position_ids is not None: - expected_shape = (3, input_ids.shape[0], input_ids.shape[1]) # (3, batch, seq_len) - if position_ids.shape != expected_shape: - if position_ids.shape == (input_ids.shape[0], input_ids.shape[1]): - position_ids, _ = self.get_rope_index( - input_ids, image_grid_thw, video_grid_thw, second_per_grid_ts, attention_mask - ) - else: - raise ValueError(f"Unexpected position_ids shape: {position_ids.shape}, " - f"expected: {expected_shape} or {(input_ids.shape[0], input_ids.shape[1])}") - + force_vit_video = kwargs.pop("force_vit_video", False) if position_ids is None and input_ids is not None: position_ids, _ = self.get_rope_index( input_ids, image_grid_thw, video_grid_thw, second_per_grid_ts, attention_mask diff --git a/mcore_adapter/src/mcore_adapter/models/qwen2_vl/modeling_qwen2_vl.py b/mcore_adapter/src/mcore_adapter/models/qwen2_vl/modeling_qwen2_vl.py index 3f68851b7..f42cb7e8b 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen2_vl/modeling_qwen2_vl.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen2_vl/modeling_qwen2_vl.py @@ -26,6 +26,10 @@ def __init__(self, config: "Qwen2VLConfig", **kwargs): attn_implementation="sdpa", torch_dtype=self.config.params_dtype, ).to(current_platform.current_device()) + # TODO: use_reentrant=True might cause error by twice forward/backward when + # training images and videos simultaneously, https://github.com/pytorch/pytorch/issues/81296 + if config.recompute_granularity == "full" and self.training: + self.vision_model.gradient_checkpointing_enable({"use_reentrant": False}) for param in self.vision_model.parameters(): setattr(param, "sequence_parallel", config.sequence_parallel) @@ -303,16 +307,6 @@ def forward( ) -> "torch.Tensor": force_vit_image = kwargs.pop("force_vit_image", False) force_vit_video = kwargs.pop("force_vit_video", False) - - if position_ids is not None: - expected_shape = (3, input_ids.shape[0], input_ids.shape[1]) # (3, batch, seq_len) - if position_ids.shape != expected_shape: - if position_ids.shape == (input_ids.shape[0], input_ids.shape[1]): - position_ids, _ = self.get_rope_index(input_ids, image_grid_thw, video_grid_thw) - else: - raise ValueError(f"Unexpected position_ids shape: {position_ids.shape}, " - f"expected: {expected_shape} or {(input_ids.shape[0], input_ids.shape[1])}") - if position_ids is None and input_ids is not None: position_ids, _ = 
self.get_rope_index(input_ids, image_grid_thw, video_grid_thw) diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3/__init__.py index 0a5aced77..c75bf0315 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3/__init__.py @@ -10,6 +10,7 @@ ) from ..model_config import McaModelConfig from ..model_factory import McaGPTModel +from ...utils import is_megatron_llama register_config("qwen3", McaModelConfig) @@ -48,11 +49,18 @@ weight_converters=[ RenameConverOp(hf_names="lm_head.weight", mca_names="output_layer.weight"), RenameConverOp(hf_names="model.embed_tokens.weight", mca_names="embedding.word_embeddings.weight"), - RenameConverOp(hf_names=".input_layernorm.weight", mca_names=".self_attention.linear_qkv.layer_norm_weight"), + RenameConverOp( + hf_names=".input_layernorm.weight", + mca_names=".self_attention.linear_qkv.layer_norm_weight" + if not is_megatron_llama() else ".input_layernorm.weight" + ), RenameConverOp(hf_names=".self_attn.o_proj.weight", mca_names=".self_attention.linear_proj.weight"), RenameConverOp(hf_names=".self_attn.q_norm.weight", mca_names=".self_attention.q_layernorm.weight"), RenameConverOp(hf_names=".self_attn.k_norm.weight", mca_names=".self_attention.k_layernorm.weight"), - RenameConverOp(hf_names=".post_attention_layernorm.weight", mca_names=".mlp.linear_fc1.layer_norm_weight"), + RenameConverOp( + hf_names=".post_attention_layernorm.weight", + mca_names=".mlp.linear_fc1.layer_norm_weight" + if not is_megatron_llama() else ".pre_mlp_layernorm.weight"), RenameConverOp(hf_names="model.norm.weight", mca_names="decoder.final_layernorm.weight"), StackConverOp( hf_names=[".mlp.gate_proj.weight", ".mlp.up_proj.weight"], mca_names=".mlp.linear_fc1.weight", dim=0 diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_moe/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3_moe/__init__.py index d752440f1..7c53a4313 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_moe/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_moe/__init__.py @@ -10,6 +10,7 @@ ) from ..model_config import McaModelConfig from ..model_factory import McaGPTModel +from ...utils import is_megatron_llama register_config("qwen3_moe", McaModelConfig) @@ -56,7 +57,11 @@ weight_converters=[ RenameConverOp(hf_names="lm_head.weight", mca_names="output_layer.weight"), RenameConverOp(hf_names="model.embed_tokens.weight", mca_names="embedding.word_embeddings.weight"), - RenameConverOp(hf_names=".input_layernorm.weight", mca_names=".self_attention.linear_qkv.layer_norm_weight"), + RenameConverOp( + hf_names=".input_layernorm.weight", + mca_names=".self_attention.linear_qkv.layer_norm_weight" + if not is_megatron_llama() else ".input_layernorm.weight" + ), RenameConverOp(hf_names=".self_attn.o_proj.weight", mca_names=".self_attention.linear_proj.weight"), RenameConverOp(hf_names=".self_attn.q_norm.weight", mca_names=".self_attention.q_layernorm.weight"), RenameConverOp(hf_names=".self_attn.k_norm.weight", mca_names=".self_attention.k_layernorm.weight"), diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_next/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3_next/__init__.py index b9ef623f6..12c3adc98 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_next/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_next/__init__.py @@ -11,6 +11,7 @@ ) from ..converter.template import ( ConverOp, + CopyConverOp, QKVConverOp, 
RenameConverOp, StackConverOp, @@ -37,6 +38,8 @@ def _mca_to_hf(self, weights): class NextQKVConverOp(QKVConverOp): """query weight used for calculating query_states and gate""" def _hf_to_mca(self, weights): + if self.hidden_size is None: + self.hidden_size = self.mca_config.hidden_size q_weight, k_weight, v_weight = weights nh = self.mca_config.num_attention_heads ng = self.mca_config.num_query_groups @@ -49,19 +52,21 @@ def _hf_to_mca(self, weights): v_weight.reshape((ng, dim, -1)), ], dim=1, - ).reshape((-1, self.mca_config.hidden_size)) + ).reshape((-1, self.hidden_size)) return mca_qkv_weight def _mca_to_hf(self, weights): + if self.hidden_size is None: + self.hidden_size = self.mca_config.hidden_size qkv_weight = weights[0] ng = self.mca_config.num_query_groups nh = self.mca_config.num_attention_heads dim = self.mca_config.kv_channels qkv_weight = qkv_weight.reshape((ng, dim * (nh // ng * 2 + 2), -1)) qkv_weights = torch.split(qkv_weight, [dim * nh // ng * 2, dim, dim], dim=1) - q_weight = qkv_weights[0].reshape((-1, self.mca_config.hidden_size)) - k_weight = qkv_weights[1].reshape((-1, self.mca_config.hidden_size)) - v_weight = qkv_weights[2].reshape((-1, self.mca_config.hidden_size)) + q_weight = qkv_weights[0].reshape((-1, self.hidden_size)) + k_weight = qkv_weights[1].reshape((-1, self.hidden_size)) + v_weight = qkv_weights[2].reshape((-1, self.hidden_size)) return [q_weight, k_weight, v_weight] @@ -95,14 +100,39 @@ def add_hf_weight(self, name, weight): return {f"decoder.layers.{layer_idx}.input_layernorm.weight": weight} return super().add_hf_weight(name, weight) - def add_mca_weight(self, name, weight): + def add_mca_weight(self, name, weight, **kwargs): pattern = r"^decoder\.layers\.(\d+)\.input_layernorm\.weight$" match = re.match(pattern, name) if not match: - return super().add_mca_weight(name, weight) + return super().add_mca_weight(name, weight, **kwargs) layer_idx = int(match.group(1)) if match else None return {f"model.layers.{layer_idx}.input_layernorm.weight": weight} + def get_lora_conver_op(self, name, pattern_to_conver_ops: dict[str, ConverOp], lora_rank: int): + lora_name = name[name.find(".lora") :] + name = name[: name.find(".lora")] + ".weight" + op = self.get_conver_op(name, pattern_to_conver_ops) + if isinstance(op, RenameConverOp): + op_class = RenameConverOp + kwargs = {} + elif "lora_A" in lora_name: + op_class = CopyConverOp + kwargs = {} + elif isinstance(op, StackConverOp): + op_class = StackConverOp + kwargs = {"dim": op.dim} + elif isinstance(op, NextQKVConverOp): + op_class = NextQKVConverOp + kwargs = {"hidden_size": lora_rank} + else: + raise ValueError(f"can not find lora conver op for {name} in {pattern_to_conver_ops}") + return op_class( + hf_names=[hf_name.replace(".weight", lora_name) for hf_name in op.hf_names], + mca_names=[mca_name.replace(".weight", lora_name) for mca_name in op.mca_names], + _mca_config=op.mca_config, + **kwargs, + ) + register_template( "qwen3_next", diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_next/modeling_qwen3_next.py b/mcore_adapter/src/mcore_adapter/models/qwen3_next/modeling_qwen3_next.py index d064c708a..40bba96b2 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_next/modeling_qwen3_next.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_next/modeling_qwen3_next.py @@ -12,10 +12,11 @@ from megatron.core.transformer.transformer_layer import get_transformer_layer_offset from torch.nn import functional as F +from ...platforms import current_platform from ..auto.modeling_auto import 
register_model from ..model_factory import McaGPTModel from .config_qwen3_next import Qwen3NextConfig -from ...platforms import current_platform + # based on qwen3next code in transformers class Qwen3NextRMSNorm(nn.Module): @@ -132,7 +133,7 @@ def fix_query_key_value_ordering(self, mixed_qkvz, mixed_ba): return query, key, value, z, b, a def forward(self, hidden_states: torch.Tensor, **kwargs) -> Tuple[torch.Tensor, Optional[torch.Tensor]]: - hidden_states = hidden_states.transpose(0, 1) # [b, s, h] + hidden_states = hidden_states.transpose(0, 1) # [b, s, h] # Set up dimensions for reshapes later batch_size, seq_len, _ = hidden_states.shape @@ -264,7 +265,7 @@ def forward( ] try: - import transformer_engine # pylint: disable=unused-import + import transformer_engine # noqa: F401 from megatron.core.extensions.transformer_engine import SplitAlongDim except ImportError: SplitAlongDim = None diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_omni/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/__init__.py new file mode 100644 index 000000000..4fdae9671 --- /dev/null +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/__init__.py @@ -0,0 +1,142 @@ +from dataclasses import dataclass + +from ..converter.dist_converter import ( + DistParallelConfig, + default_dist_config, + register_dist_config, + shared_moe_dist_config, +) +from ..converter.template import ( + QKVBiasConverOp, + QKVConverOp, + RenameConverOp, + StackConverOp, + Template, + register_template, +) +from .config_qwen3_omni import Qwen3OmniMoeConfig +from .modeling_qwen3_omni import Qwen3OmniMoeModel + + +@dataclass +class Qwen3OmniMoeTemplate(Template): + def adjust_config_hf_to_mca(self): + non_text_config_keys = set( + list(filter(lambda k: k.endswith("_token_id"), self.config_hf_to_mca.keys())) + + ["position_id_per_seconds", "vision_config", "audio_config"] + ) + audio_output_config_keys = ["enable_audio_output", "talker_config", "code2wav_config"] + new_config_hf_to_mca = {} + for hf_key, mca_key in self.config_hf_to_mca.items(): + new_hf_key = hf_key + if hf_key not in audio_output_config_keys: + if hf_key not in non_text_config_keys: + new_hf_key = "text_config." + new_hf_key + new_hf_key = "thinker_config." + new_hf_key + new_config_hf_to_mca[new_hf_key] = mca_key + return new_config_hf_to_mca + + +register_dist_config( + "qwen3_omni_moe", + default_dist_config.merge_configs(shared_moe_dist_config).merge_configs( + DistParallelConfig( + pre_process_weights=["vision_model.*", "audio_model.*"], + post_process_weights=["talker.*", "code2wav.*"], + duplicated_weights=["vision_model.*", "audio_model.*", "talker.*", "code2wav.*"], + ) + ), +) + + +# NOTE: thinking and instruct both use qwen3_omni_moe as model_type and Qwen3OmniMoeForConditionalGeneration +# as architecture, thus both hf config and weight key has thinker prefix. And it seems the processor cannot +# use list fps thus video should be processed by one by one. 
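`Qwen3OmniMoeTemplate.adjust_config_hf_to_mca` re-roots most text-model keys under `thinker_config.text_config.`, keeps token-id style keys and the vision/audio sub-configs directly under `thinker_config.`, and leaves the audio-output keys untouched. A toy version of that key rewriting:

```python
def prefix_keys(config_hf_to_mca: dict[str, str]) -> dict[str, str]:
    non_text = {k for k in config_hf_to_mca if k.endswith("_token_id")} | {
        "position_id_per_seconds", "vision_config", "audio_config",
    }
    audio_output = {"enable_audio_output", "talker_config", "code2wav_config"}
    out = {}
    for hf_key, mca_key in config_hf_to_mca.items():
        new_key = hf_key
        if hf_key not in audio_output:
            if hf_key not in non_text:
                new_key = "text_config." + new_key
            new_key = "thinker_config." + new_key
        out[new_key] = mca_key
    return out


print(prefix_keys({"hidden_size": "hidden_size", "image_token_id": "image_token_id"}))
# {'thinker_config.text_config.hidden_size': 'hidden_size',
#  'thinker_config.image_token_id': 'image_token_id'}
```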
+# TODO: Should we use "thinker" for naming template/config/model, would there exist confilicts if we support +# instruct model since thinking and instruct both use qwen3_omni_moe +register_template( + "qwen3_omni_moe", + hf_layer_prefix="thinker.model.layers.", + hf_moe_prefix=".mlp.experts.", + template_class=Qwen3OmniMoeTemplate, # Qwen3VLMoeTemplate, + # hf has hierarchical config for multi-modal models while mca has flat config + config_hf_to_mca={ + "max_position_embeddings": "max_sequence_length", + "hidden_size": "hidden_size", + "attention_bias": "add_qkv_bias", + "head_dim": "kv_channels", + "num_attention_heads": "num_attention_heads", + "num_key_value_heads": "num_query_groups", + "num_hidden_layers": "num_layers", + "rms_norm_eps": "layernorm_epsilon", + "vocab_size": "padded_vocab_size", + "attention_dropout": "attention_dropout", + "rope_theta": "rotary_base", + "rope_scaling": "rope_scaling", + "intermediate_size": "ffn_hidden_size", + "tie_word_embeddings": "tie_embeddings_and_output_weights", + # MoE related + "moe_intermediate_size": "moe_ffn_hidden_size", + "decoder_sparse_step": "moe_layer_freq", + "num_experts": "num_moe_experts", + "num_experts_per_tok": "moe_router_topk", + "router_aux_loss_coef": "moe_aux_loss_coeff", + # ait ralated, only need for usage in get_rope_index + "audio_token_id": "audio_token_id", + "audio_start_token_id": "audio_start_token_id", + # "audio_end_token_id": "audio_start_token_id", + # vit related, only need for usage in get_rope_index + "image_token_id": "image_token_id", + "video_token_id": "video_token_id", + "vision_start_token_id": "vision_start_token_id", + # "vision_end_token_id": "vision_end_token_id", + "position_id_per_seconds": "position_id_per_seconds", + "vision_config": "vision_config", + "audio_config": "audio_config", + "enable_audio_output": "enable_audio_output", + "talker_config": "talker_config", + "code2wav_config": "code2wav_config", + }, + constant_mca_config={ + "swiglu": True, + "position_embedding_type": "mrope", # TM-ROPE + "normalization": "RMSNorm", + "add_bias_linear": False, + "hidden_dropout": 0.0, + "rotary_percent": 1.0, + "moe_router_load_balancing_type": "aux_loss", + "moe_router_pre_softmax": False, + "qk_layernorm": True, + }, + weight_converters=[ + RenameConverOp(hf_names="thinker.model.embed_tokens.weight", mca_names="embedding.word_embeddings.weight"), + RenameConverOp(hf_names="thinker.model.norm.weight", mca_names="decoder.final_layernorm.weight"), + RenameConverOp(hf_names="thinker.lm_head.weight", mca_names="output_layer.weight"), + RenameConverOp(hf_names=".input_layernorm.weight", mca_names=".self_attention.linear_qkv.layer_norm_weight"), + # attention weights + QKVConverOp( + hf_names=[".self_attn.q_proj.weight", ".self_attn.k_proj.weight", ".self_attn.v_proj.weight"], + mca_names=".self_attention.linear_qkv.weight", + ), + QKVBiasConverOp( + hf_names=[".self_attn.q_proj.bias", ".self_attn.k_proj.bias", ".self_attn.v_proj.bias"], + mca_names=".self_attention.linear_qkv.bias", + ), # attention_bias is false actually + RenameConverOp(hf_names=".self_attn.o_proj.weight", mca_names=".self_attention.linear_proj.weight"), + RenameConverOp(hf_names=".self_attn.q_norm.weight", mca_names=".self_attention.q_layernorm.weight"), + RenameConverOp(hf_names=".self_attn.k_norm.weight", mca_names=".self_attention.k_layernorm.weight"), + RenameConverOp(hf_names=".post_attention_layernorm.weight", mca_names=".pre_mlp_layernorm.weight"), + # moe weights + RenameConverOp(hf_names=".mlp.gate.weight", 
mca_names=".mlp.router.weight"), + StackConverOp(hf_names=[".gate_proj.weight", ".up_proj.weight"], mca_names=".linear_fc1.weight", dim=0), + RenameConverOp(hf_names=".down_proj.weight", mca_names=".linear_fc2.weight"), + RenameConverOp(hf_names="thinker.visual.{}", mca_names="vision_model.{}"), + # add audio model to make it can be saved and used in hf + # although the audio_model weights can be put into template.hf_invalid_keys + RenameConverOp(hf_names="thinker.audio_tower.{}", mca_names="audio_model.{}"), + RenameConverOp(hf_names="talker.{}", mca_names="talker.{}"), + RenameConverOp(hf_names="code2wav.{}", mca_names="code2wav.{}"), + ], +) + +__all__ = ["Qwen3OmniMoeConfig", "Qwen3OmniMoeModel"] diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_omni/config_qwen3_omni.py b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/config_qwen3_omni.py new file mode 100644 index 000000000..ba1ef0eb5 --- /dev/null +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/config_qwen3_omni.py @@ -0,0 +1,69 @@ +from dataclasses import dataclass, field +from typing import Optional + +from transformers import PretrainedConfig + +from ...utils import get_logger +from ..auto.config_auto import register_config +from ..model_config import McaModelConfig + + +logger = get_logger(__name__) + +@register_config("qwen3_omni_moe") +@dataclass +class Qwen3OmniMoeConfig(McaModelConfig): + audio_token_id: int = 151675 + image_token_id: int = 151655 + video_token_id: int = 151656 + position_id_per_seconds: int = 13 + audio_start_token_id: int = 151669 + vision_start_token_id: int = 151652 + vision_config: Optional[dict] = field( + default=None, + metadata={"help": "Vision model config."}, + ) + audio_config: Optional[dict] = field( + default=None, + metadata={"help": "audio model config."}, + ) + # text_config: Optional[dict] = field( + # default=None, + # metadata={"help": "Text model config."}, + # ) + enable_audio_output: bool = False + talker_config: Optional[dict] = field( + default=None, + metadata={"help": "talker model config."}, + ) + code2wav_config: Optional[dict] = field( + default=None, + metadata={"help": "code2wav model config."}, + ) + rope_scaling: Optional[dict] = field( + default=None, + metadata={"help": "Rope scaling."}, + ) + + def __post_init__(self): + super().__post_init__() + from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import Qwen3OmniMoeVisionEncoderConfig + + if isinstance(self.audio_config, PretrainedConfig): + self.audio_config = self.audio_config.to_dict() + if isinstance(self.vision_config, PretrainedConfig): + self.vision_config = self.vision_config.to_dict() + if isinstance(self.talker_config, PretrainedConfig): + self.talker_config = self.talker_config.to_dict() + if isinstance(self.code2wav_config, PretrainedConfig): + self.code2wav_config = self.code2wav_config.to_dict() + vision_config_obj = Qwen3OmniMoeVisionEncoderConfig(**self.vision_config) + self.merge_size = vision_config_obj.spatial_merge_size + self.pixel_values_dim = ( + vision_config_obj.patch_size + * vision_config_obj.patch_size + * vision_config_obj.in_channels + * vision_config_obj.temporal_patch_size + ) # 1536 + assert "mrope_section" in self.rope_scaling, "mrope_section is required" + self.mrope_section = self.rope_scaling.get("mrope_section") diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_omni/modeling_qwen3_omni.py b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/modeling_qwen3_omni.py new file mode 100644 index 000000000..e0ed698e8 --- /dev/null +++ 
b/mcore_adapter/src/mcore_adapter/models/qwen3_omni/modeling_qwen3_omni.py @@ -0,0 +1,305 @@ +import types +from typing import Optional, List + +import torch +from megatron.core import mpu + +from ..auto.modeling_auto import register_model +from ..qwen3_vl.modeling_qwen3_vl import Qwen3VLGPTModel, Qwen3VLModel +from .config_qwen3_omni import Qwen3OmniMoeConfig + + +@register_model("qwen3_omni_moe") +class Qwen3OmniMoeModel(Qwen3VLModel): + config_class = Qwen3OmniMoeConfig + + def __init__(self, config: "Qwen3OmniMoeConfig", **kwargs): + from transformers.models.qwen3_omni_moe.configuration_qwen3_omni_moe import ( + Qwen3OmniMoeAudioEncoderConfig, + Qwen3OmniMoeVisionEncoderConfig, + ) + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeAudioEncoder, + Qwen3OmniMoePreTrainedModelForConditionalGeneration, + Qwen3OmniMoeVisionEncoder, + _get_feat_extract_output_lengths, + ) + + Qwen3VLGPTModel.__init__(self, config, **kwargs) + + if mpu.get_pipeline_model_parallel_rank() == 0 and self.vp_stage == 0: + assert self.decoder.num_layers_per_pipeline_rank >= len( + config.vision_config.get("deepstack_visual_indexes", [8, 16, 24]) + ), "Current pp and vp not support deepstack" + + if self.pre_process: + # add audio model to make it can be saved and used in hf + # although the audio_model weights can be put into template.hf_invalid_keys + self.audio_model = Qwen3OmniMoeAudioEncoder._from_config( + Qwen3OmniMoeAudioEncoderConfig(**config.audio_config), + attn_implementation="sdpa", + torch_dtype=self.config.params_dtype, + ).to(torch.cuda.current_device()) + for param in self.audio_model.parameters(): + setattr(param, "sequence_parallel", config.sequence_parallel) + self.vision_model = Qwen3OmniMoeVisionEncoder._from_config( + Qwen3OmniMoeVisionEncoderConfig(**config.vision_config), + attn_implementation="sdpa", + torch_dtype=self.config.params_dtype, + ).to(torch.cuda.current_device()) + # TODO: use_reentrant=True might cause error by twice forward/backward when + # training images and videos simultaneously, https://github.com/pytorch/pytorch/issues/81296 + if config.recompute_granularity == "full" and self.training: + self.vision_model.gradient_checkpointing_enable({"use_reentrant": False}) + for param in self.vision_model.parameters(): + setattr(param, "sequence_parallel", config.sequence_parallel) + + if self.post_process: + if config.enable_audio_output: + # not support talker with audio output yet + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerForConditionalGeneration, + Qwen3OmniMoeCode2Wav, + ) + from transformers.models.qwen3_omni_moe.modeling_qwen3_omni_moe import ( + Qwen3OmniMoeTalkerConfig, + Qwen3OmniMoeCode2WavConfig, + ) + self.talker = Qwen3OmniMoeTalkerForConditionalGeneration._from_config( + Qwen3OmniMoeTalkerConfig(**config.talker_config), + torch_dtype=self.config.params_dtype, + ).to(torch.cuda.current_device()) + self.code2wav = Qwen3OmniMoeCode2Wav._from_config( + Qwen3OmniMoeCode2WavConfig(**config.code2wav_config), + torch_dtype=self.config.params_dtype, + ).to(torch.cuda.current_device()) + + # construct get_rope_index needed method and attrs + self.get_rope_index = types.MethodType( + Qwen3OmniMoePreTrainedModelForConditionalGeneration.get_rope_index, self + ) + self.get_llm_pos_ids_for_vision = types.MethodType( + Qwen3OmniMoePreTrainedModelForConditionalGeneration.get_llm_pos_ids_for_vision, self + ) + self.spatial_merge_size = self.config.merge_size + + 
self._get_feat_extract_output_lengths = _get_feat_extract_output_lengths
+
+    def construct_inputs_embeds(
+        self,
+        input_ids: "torch.LongTensor",
+        inputs_embeds: "torch.FloatTensor",
+        pixel_values: "torch.Tensor",
+        grid_thw: "torch.LongTensor",
+        pixel_values_videos: "torch.Tensor",
+        video_grid_thw: "torch.LongTensor",
+        input_features: "torch.Tensor",
+        feature_lens: "torch.Tensor",
+        feature_attention_mask: "torch.Tensor",
+        input_ranges: List[List[int]],
+        image_token_id: int,
+        video_token_id: int,
+        audio_token_id: int,
+    ):
+        """
+        inputs_embeds: [s, b, h] or [s/tp, b, h] when sequence parallel
+        input_ranges: sequence ranges
+        """
+        visual_pos_masks, deepstack_visual_embeds = None, None
+        # TODO: same as qwen3-vl, only images or videos are supported since there is no deepstack_visual_embeds merge process currently
+        # maybe merge images and videos first to run vision_model and get deepstack_visual_embeds for images and videos simultaneously
+        assert pixel_values is None or pixel_values_videos is None, (
+            "inputs with both images and videos are not supported temporarily"
+        )
+        if pixel_values is not None:
+            inputs_embeds, visual_pos_masks, deepstack_visual_embeds = super().construct_inputs_embeds(
+                input_ids,
+                inputs_embeds,
+                pixel_values,
+                grid_thw,
+                input_ranges,
+                image_token_id,
+            )
+        elif pixel_values_videos is not None:
+            inputs_embeds, visual_pos_masks, deepstack_visual_embeds = super().construct_inputs_embeds(
+                input_ids,
+                inputs_embeds,
+                pixel_values_videos,
+                video_grid_thw,
+                input_ranges,
+                video_token_id,
+            )
+
+        if input_features is None:
+            return inputs_embeds, visual_pos_masks, deepstack_visual_embeds
+
+        # for audio input embeds
+        # (bs, freqs, frames) -> (total_frames, freqs)
+        input_features = input_features.permute(0, 2, 1)[feature_attention_mask.bool()]
+        # TODO: audio can actually be treated as chunks of frames with chunk_size for sp/cp,
+        # chunk_size = 100 * (self.n_window_infer // (self.n_window * 2))
+        # temporarily only split audios instead of chunks to keep it simple, which may cause duplicated calculation for the same audio
+        # maybe scatter chunks to the sp/cp group for better load balance later
+        feat_mask = input_ids == audio_token_id
+        feat_culens = feature_lens.cumsum(dim=0, dtype=torch.int32).tolist()  # use list
+        feat_embeds_culens = self._get_feat_extract_output_lengths(feature_lens).cumsum(dim=0, dtype=torch.int32)
+        required_feat = []  # audio features to feed to the audio tower
+        required_feat_lens = []  # audio feature lengths to feed to the audio tower
+        valid_feat_embeds_nums = []  # indicate the ranges of needed feature embeds
+        added_feat_indexes = []  # feature indexes included in input_ranges
+        for i in range(feat_mask.shape[0]):
+            for inputs_start, inputs_end in input_ranges:
+                # same as qwen-vl, get features included in a sub-range corresponding to each sample
+                valid_feat_embeds_start = feat_mask[:i].sum().item()
+                valid_feat_embeds_start += feat_mask[i, :inputs_start].sum().item()
+                embeds_num = feat_mask[i, inputs_start:inputs_end].sum().item()
+                valid_feat_embeds_end = valid_feat_embeds_start + embeds_num
+                used_embeds_culen_start = 0  # embeds seqlens before this sub-range
+                new_embeds_culen_start = 0  # embeds seqlens newly added in this sub-range, new_embeds_culen_start >= used_embeds_culen_start
+                added_culen_before_used = 0  # embeds seqlens in earlier sub-ranges of input_ranges
+                embed_culen_end = feat_embeds_culens[-1]
+                for feat_index, feat_embeds_culen in enumerate(feat_embeds_culens):
+                    if valid_feat_embeds_start < feat_embeds_culen:  # included in current sub-range
+                        if feat_index not
in added_feat_indexes: + # included in current sub-range and have not been added before, add it + required_feat_lens.append(feature_lens[feat_index]) + # maybe extend together at last, while mapping from embeds length to feature length is not direct + required_feat.append( + input_features[ + (0 if feat_index == 0 else feat_culens[feat_index - 1]) : feat_culens[feat_index] + ] + ) + added_feat_indexes.append(feat_index) + else: + # included in current sub-range but have been added by previous sub-range of this sample, skip it + new_embeds_culen_start = feat_embeds_culen + else: # not included in current sub-range + used_embeds_culen_start = feat_embeds_culen + new_embeds_culen_start = feat_embeds_culen + if feat_index in added_feat_indexes: # included in before sub-ranges of input_ranges + before_culen = 0 if feat_index == 0 else feat_embeds_culens[feat_index - 1].item() + added_culen_before_used += feat_embeds_culen - before_culen + if valid_feat_embeds_end <= feat_embeds_culen: + embed_culen_end = feat_embeds_culen + break + + # embeds offset in range for this sub-range: offset_in_range = offset_in_start_feat + emb_len_of_pre_subranges + embeds_needed_start = valid_feat_embeds_start - used_embeds_culen_start + added_culen_before_used + embeds_needed_end = valid_feat_embeds_end - used_embeds_culen_start + added_culen_before_used + if embeds_needed_start < embeds_needed_end: + valid_feat_embeds_nums.append((embeds_needed_start, embeds_needed_end)) + + if len(valid_feat_embeds_nums) == 0: + # should we use dummy feature input to handle this, _handle_missing_visual is used in qwen-vl + return inputs_embeds, visual_pos_masks, deepstack_visual_embeds + + required_feat = torch.cat(required_feat, dim=0) + required_feat_lens = torch.stack(required_feat_lens, dim=0) + feat_model_dtype = self.audio_model.layers[0].fc1.weight.dtype + required_feat = required_feat.type(feat_model_dtype) + # convert to (freqs, total_frames) for input_features to use audio_tower from hf + required_feat = required_feat.permute(1, 0) + feat_embeds = self.audio_model(required_feat, required_feat_lens) + feat_embeds = feat_embeds.last_hidden_state.to(inputs_embeds.device, inputs_embeds.dtype) + feat_mask = torch.cat( + [feat_mask[:, inputs_start:inputs_end] for inputs_start, inputs_end in input_ranges], dim=1 + ) + needed_feat_embeds_num = feat_mask.sum().item() + needed_feat_embeds = torch.zeros( + [needed_feat_embeds_num] + list(feat_embeds.shape[1:]), + dtype=inputs_embeds.dtype, + device=inputs_embeds.device, + ) + + added_num = 0 + for start, end in valid_feat_embeds_nums: + embeds_num = end - start + needed_feat_embeds[added_num : added_num + embeds_num] = feat_embeds[start:end] + added_num += embeds_num + assert added_num == needed_feat_embeds_num + + inputs_embeds = inputs_embeds.transpose(0, 1) # [s, b, h] -> [b, s, h] + feat_mask = feat_mask.unsqueeze(-1).expand_as(inputs_embeds) + inputs_embeds = inputs_embeds.masked_scatter(feat_mask, needed_feat_embeds) + inputs_embeds = inputs_embeds.transpose(0, 1).contiguous() + + return inputs_embeds, visual_pos_masks, deepstack_visual_embeds + + def forward( + self, + input_ids: "torch.Tensor", + position_ids: Optional["torch.Tensor"] = None, + attention_mask: Optional["torch.Tensor"] = None, + decoder_input: Optional["torch.Tensor"] = None, + labels: Optional["torch.Tensor"] = None, + pixel_values: Optional["torch.Tensor"] = None, + pixel_values_videos: Optional["torch.Tensor"] = None, + image_grid_thw: Optional["torch.LongTensor"] = None, + video_grid_thw: 
Optional["torch.LongTensor"] = None, + use_audio_in_video: Optional[bool] = None, + video_second_per_grid: Optional[torch.Tensor] = None, + input_features: Optional["torch.Tensor"] = None, + feature_attention_mask: Optional["torch.Tensor"] = None, + **kwargs, + ) -> "torch.Tensor": + force_vit_image = kwargs.pop("force_vit_image", False) + force_vit_video = kwargs.pop("force_vit_video", False) + feature_lens = None + if position_ids is None and input_ids is not None: + if feature_attention_mask is not None: + feature_lens = torch.sum(feature_attention_mask, dim=1) + position_ids, _ = self.get_rope_index( + input_ids, + image_grid_thw, + video_grid_thw, + attention_mask=torch.ones(input_ids.shape, dtype=input_ids.dtype, device=input_ids.device), + use_audio_in_video=use_audio_in_video, + audio_seqlens=feature_lens, + second_per_grids=video_second_per_grid, + ) + + cp_batch = { + "input_ids": input_ids, + "attention_mask": attention_mask, + } + if self.config.context_parallel_size > 1: + cp_batch = {k: v.clone() if v is not None else None for k, v in cp_batch.items()} + cp_batch = super(Qwen3VLModel, self).get_batch_on_this_cp_rank(cp_batch, dim3_keys=[]) + + if not self.pre_process or decoder_input is not None: + return super(Qwen3VLModel, self).forward( + decoder_input=decoder_input, labels=labels, position_ids=position_ids, **cp_batch, **kwargs + ) + + inputs_ranges = self.get_input_ranges(input_ids.shape[1]) + + inputs_embeds = self.embedding(input_ids=cp_batch["input_ids"], position_ids=None) + + if pixel_values is not None or pixel_values_videos is not None: + inputs_embeds, visual_pos_masks, deepstack_visual_embeds = self.construct_inputs_embeds( + input_ids, + inputs_embeds, + pixel_values, + image_grid_thw, + pixel_values_videos, + video_grid_thw, + input_features, + feature_lens, + feature_attention_mask, + inputs_ranges, + self.config.image_token_id, + self.config.video_token_id, + self.config.audio_token_id, + ) + elif force_vit_image or force_vit_video: + inputs_embeds, visual_pos_masks, deepstack_visual_embeds = self._handle_missing_visual(inputs_embeds) + + return super(Qwen3VLModel, self).forward( + decoder_input=inputs_embeds, + labels=labels, + position_ids=position_ids, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + **cp_batch, + **kwargs, + ) diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/__init__.py index 054c12697..fdc277056 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/__init__.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/__init__.py @@ -1,9 +1,12 @@ +from dataclasses import dataclass + from ..converter.dist_converter import DistParallelConfig, default_dist_config, register_dist_config from ..converter.template import ( QKVBiasConverOp, QKVConverOp, RenameConverOp, StackConverOp, + Template, register_template, ) from .config_qwen3_vl import Qwen3VLConfig @@ -20,9 +23,34 @@ ), ) + +@dataclass +class Qwen3VLTemplate(Template): + def adjust_config_hf_to_mca(self): + # NOTE: for `tie_word_embeddings`, + # in qwen3-vl model like Qwen/Qwen3-VL-4B-Instruct, tie_word_embeddings + # exists both in inner and outer of text_config, and both are True + # in qwen3-vl-moe model like Qwen/Qwen3-VL-30B-A3B-Instruct, tie_word_embeddings + # in outer of text_config is False while it uses the default value True in the + # inner of text_config + # currently, both use tie_word_embeddings in the outter of text_config + non_text_config_keys = set( + 
list(filter(lambda k: k.endswith("_token_id"), self.config_hf_to_mca.keys())) + + ["vision_config", "tie_word_embeddings"] + ) + new_config_hf_to_mca = {} + for hf_key, mca_key in self.config_hf_to_mca.items(): + new_hf_key = hf_key + if hf_key not in non_text_config_keys: + new_hf_key = "text_config." + new_hf_key + new_config_hf_to_mca[new_hf_key] = mca_key + return new_config_hf_to_mca + + register_template( "qwen3_vl", hf_layer_prefix="model.language_model.layers.", + template_class=Qwen3VLTemplate, config_hf_to_mca={ "max_position_embeddings": "max_sequence_length", "hidden_size": "hidden_size", diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/modeling_qwen3_vl.py b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/modeling_qwen3_vl.py index fc0494362..5702f3f4a 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/modeling_qwen3_vl.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/modeling_qwen3_vl.py @@ -154,6 +154,10 @@ def __init__(self, config: "Qwen3VLConfig", **kwargs): attn_implementation="sdpa", torch_dtype=self.config.params_dtype, ).to(torch.cuda.current_device()) + # TODO: use_reentrant=True might cause error by twice forward/backward when + # training images and videos simultaneously, https://github.com/pytorch/pytorch/issues/81296 + if config.recompute_granularity == "full" and self.training: + self.vision_model.gradient_checkpointing_enable({"use_reentrant": False}) for param in self.vision_model.parameters(): setattr(param, "sequence_parallel", config.sequence_parallel) diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/rope_utils.py b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/rope_utils.py index 61ed6e3bd..cff5d2ab7 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/rope_utils.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/rope_utils.py @@ -3,7 +3,6 @@ import torch from megatron.core import parallel_state from megatron.core.models.common.embeddings.rope_utils import ( - _apply_rotary_pos_emb_bshd, get_pos_emb_on_this_cp_rank, ) from torch import nn diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/transformer_block.py b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/transformer_block.py index 02775d448..d9543cc47 100644 --- a/mcore_adapter/src/mcore_adapter/models/qwen3_vl/transformer_block.py +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_vl/transformer_block.py @@ -21,7 +21,7 @@ try: - import transformer_engine.pytorch as te # pylint: disable=unused-import + import transformer_engine.pytorch as te # noqa: F401 HAVE_TE = True except ImportError: diff --git a/mcore_adapter/src/mcore_adapter/models/qwen3_vl_moe/__init__.py b/mcore_adapter/src/mcore_adapter/models/qwen3_vl_moe/__init__.py new file mode 100644 index 000000000..282ecf9ff --- /dev/null +++ b/mcore_adapter/src/mcore_adapter/models/qwen3_vl_moe/__init__.py @@ -0,0 +1,190 @@ +from dataclasses import dataclass +from typing import TYPE_CHECKING + +import torch + +from ..auto.config_auto import register_config +from ..auto.modeling_auto import register_model +from ..converter.convert_utils import ( + convert_to_hf_prefix, + get_mca_moe_index, + get_mca_weight_prefix, + remove_mca_weight_prefix, +) +from ..converter.dist_converter import ( + DistParallelConfig, + default_dist_config, + register_dist_config, + shared_moe_dist_config, +) +from ..converter.template import ( + ConverOp, + QKVBiasConverOp, + QKVConverOp, + RenameConverOp, + StackedTensors, + Template, + register_template, +) +from ..qwen3_vl import Qwen3VLConfig, Qwen3VLModel, 
Qwen3VLTemplate + + +if TYPE_CHECKING: + from megatron.core.transformer import TransformerConfig + + +@dataclass +class SplitConverOp(ConverOp): + def __post_init__(self): + super().__post_init__() + assert len(self.hf_names) == 1, f"SplitConverOp only support one name {self.hf_names}" + + @property + def mca_config(self) -> "TransformerConfig": + return self._mca_config + + @mca_config.setter + def mca_config(self, value: "TransformerConfig"): + self._mca_config = value + if len(self.mca_names) == 1: + mca_name = self.mca_names[0] + num_splits = self._mca_config.num_moe_experts + self.mca_names = [str(i) + mca_name for i in range(num_splits)] + + def _hf_to_mca(self, weights): + return list(torch.unbind(weights[0].transpose(1, 2).contiguous(), dim=0)) + + def _mca_to_hf(self, weights): + if isinstance(weights[0], StackedTensors): + return torch.stack([torch.cat(weight.tensors) for weight in weights], dim=0).transpose(1, 2).contiguous() + return torch.stack(weights, dim=0).transpose(1, 2).contiguous() + + +@dataclass +class SplitStackConverOp(SplitConverOp): + def _hf_to_mca(self, weights): + return [ + StackedTensors(torch.chunk(w, 2, dim=0), dim=0) + for w in torch.unbind(weights[0].transpose(1, 2).contiguous(), dim=0) + ] + + +register_config("qwen3_vl_moe", Qwen3VLConfig) +register_model("qwen3_vl_moe", Qwen3VLModel) +register_dist_config( + "qwen3_vl_moe", + default_dist_config.merge_configs(shared_moe_dist_config).merge_configs( + DistParallelConfig( + pre_process_weights=["vision_model.*"], + duplicated_weights=["vision_model.*"], + ) + ), +) + + +@dataclass +class Qwen3VLMoeTemplate(Qwen3VLTemplate): + def add_mca_weight(self, name, weight, **kwargs): + weight_prefix = get_mca_weight_prefix(name) + original_name = remove_mca_weight_prefix(name) + moe_layer_index = get_mca_moe_index(name) + # Since experts weights are stacked in qwen3_vl_moe, + # we need to add the moe index to the original name to + # ensure all experts weights have the same weight_prefix + if moe_layer_index is not None: + original_name = str(moe_layer_index) + original_name + weight_prefix = name[: -len(original_name)] + if weight_prefix not in self.prefix_name_to_weight: + self.prefix_name_to_weight[weight_prefix] = {} + self.prefix_name_to_weight[weight_prefix][original_name] = weight + prefix_weights = self.prefix_name_to_weight[weight_prefix] + # However, when looking up the converter, we still use the original name without moe index + # This is because mca_name_to_converter is built before mca_names reset which happens at + # model converter init. + original_name = remove_mca_weight_prefix(name) + if ".lora_A." in original_name or ".lora_B." 
in original_name: + op = self.get_lora_conver_op(original_name, self.mca_name_to_converter, **kwargs) + else: + op = self.get_conver_op(original_name, self.mca_name_to_converter) + name_to_weight = { + name: prefix_weights.pop(name) + for name in list(prefix_weights.keys()) + if op.is_required_name(name, mca_name=True) + } + conver_res = op(name_to_weight, mca_to_hf=True) + if conver_res is None: + # not ready to convert + self.prefix_name_to_weight[weight_prefix].update(name_to_weight) + return conver_res + hf_prefix = convert_to_hf_prefix(weight_prefix, self.hf_layer_prefix, self.hf_moe_prefix) + return {hf_prefix + name: weight for name, weight in conver_res.items()} + + +register_template( + "qwen3_vl_moe", + hf_layer_prefix="model.language_model.layers.", + hf_moe_prefix=".mlp.experts.", + template_class=Qwen3VLMoeTemplate, + config_hf_to_mca={ + "max_position_embeddings": "max_sequence_length", + "hidden_size": "hidden_size", + "attention_bias": "add_qkv_bias", + "head_dim": "kv_channels", + "num_attention_heads": "num_attention_heads", + "num_key_value_heads": "num_query_groups", + "num_hidden_layers": "num_layers", + "rms_norm_eps": "layernorm_epsilon", + "vocab_size": "padded_vocab_size", + "attention_dropout": "attention_dropout", + "rope_theta": "rotary_base", + "intermediate_size": "ffn_hidden_size", + "tie_word_embeddings": "tie_embeddings_and_output_weights", + # MoE related + "moe_intermediate_size": "moe_ffn_hidden_size", + "decoder_sparse_step": "moe_layer_freq", + "num_experts": "num_moe_experts", + "num_experts_per_tok": "moe_router_topk", + "router_aux_loss_coef": "moe_aux_loss_coeff", + # vit related + "vision_start_token_id": "vision_start_token_id", + "vision_end_token_id": "vision_end_token_id", + "vision_token_id": "vision_token_id", + "image_token_id": "image_token_id", + "video_token_id": "video_token_id", + "vision_config": "vision_config", + "rope_scaling": "rope_scaling", + }, + constant_mca_config={ + "swiglu": True, + "position_embedding_type": "mrope", + "normalization": "RMSNorm", + "add_bias_linear": False, + "hidden_dropout": 0.0, + "rotary_percent": 1.0, + "qk_layernorm": True, + }, + weight_converters=[ + RenameConverOp(hf_names="lm_head.weight", mca_names="output_layer.weight"), + RenameConverOp( + hf_names="model.language_model.embed_tokens.weight", mca_names="embedding.word_embeddings.weight" + ), + RenameConverOp(hf_names=".input_layernorm.weight", mca_names=".self_attention.linear_qkv.layer_norm_weight"), + RenameConverOp(hf_names=".self_attn.o_proj.weight", mca_names=".self_attention.linear_proj.weight"), + RenameConverOp(hf_names=".self_attn.q_norm.weight", mca_names=".self_attention.q_layernorm.weight"), + RenameConverOp(hf_names=".self_attn.k_norm.weight", mca_names=".self_attention.k_layernorm.weight"), + RenameConverOp(hf_names=".post_attention_layernorm.weight", mca_names=".pre_mlp_layernorm.weight"), + RenameConverOp(hf_names="model.language_model.norm.weight", mca_names="decoder.final_layernorm.weight"), + SplitStackConverOp(hf_names="gate_up_proj", mca_names=".linear_fc1.weight"), + SplitConverOp(hf_names="down_proj", mca_names=".linear_fc2.weight"), + RenameConverOp(hf_names=".mlp.gate.weight", mca_names=".mlp.router.weight"), + QKVConverOp( + hf_names=[".self_attn.q_proj.weight", ".self_attn.k_proj.weight", ".self_attn.v_proj.weight"], + mca_names=".self_attention.linear_qkv.weight", + ), + QKVBiasConverOp( + hf_names=[".self_attn.q_proj.bias", ".self_attn.k_proj.bias", ".self_attn.v_proj.bias"], + 
mca_names=".self_attention.linear_qkv.bias", + ), + RenameConverOp(hf_names="model.visual.{}", mca_names="vision_model.{}"), + ], +) diff --git a/mcore_adapter/src/mcore_adapter/parallel_functions/vocab_parallel.py b/mcore_adapter/src/mcore_adapter/parallel_functions/vocab_parallel.py index 3c0179b68..452a7092b 100644 --- a/mcore_adapter/src/mcore_adapter/parallel_functions/vocab_parallel.py +++ b/mcore_adapter/src/mcore_adapter/parallel_functions/vocab_parallel.py @@ -98,11 +98,13 @@ def forward(ctx, vocab_parallel_logits: "torch.Tensor", target: "torch.Tensor"): def backward(ctx, grad_output: "torch.Tensor"): exp_logits, target_mask, sum_exp_logits, masked_target_1d = ctx.saved_tensors - grad_input = -exp_logits / sum_exp_logits.unsqueeze(dim=-1) + exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1)) + exp_logits.neg_() + grad_input = exp_logits grad_2d = grad_input.view(-1, grad_input.size()[-1]) arange_1d = torch.arange(start=0, end=grad_2d.size()[0], device=grad_input.device) grad_2d[arange_1d, masked_target_1d] += 1 - target_mask.view(-1).float() - grad_input = grad_input * grad_output.unsqueeze(dim=-1) + grad_input.mul_(grad_output.unsqueeze(dim=-1)) return grad_input, None diff --git a/mcore_adapter/src/mcore_adapter/patcher.py b/mcore_adapter/src/mcore_adapter/patcher.py new file mode 100644 index 000000000..3fd3d3c1b --- /dev/null +++ b/mcore_adapter/src/mcore_adapter/patcher.py @@ -0,0 +1,160 @@ +import math +import sys +from typing import Optional + +import torch +from torch.distributed._shard.metadata import ShardMetadata +from torch.distributed._shard.sharding_spec._internals import ( + _check_shard_metadata_pair_overlap +) +from torch.distributed.checkpoint.default_planner import ( + _check_box_bounds, + _check_box_overlap, +) +from torch.distributed.checkpoint.metadata import ( + BytesStorageMetadata, + Metadata, +) +from torch.distributed.checkpoint.planner import SavePlan +from bisect import bisect_right, insort + +from .utils import get_logger + + +logger = get_logger(__name__) + + +def patch_torch_find_nd_overlapping_shards(): + """ + Ref: https://github.com/pytorch/pytorch/issues/166941 + https://github.com/pytorch/pytorch/pull/167073 + """ + def _find_nd_overlapping_shards( + shards: list[ShardMetadata], sharded_dims: list[int] + ) -> Optional[tuple[int, int]]: + """Find overlapping shards using sweep-line algorithm.""" + if len(shards) <= 1: + return None + + dims = len(sharded_dims) + if dims == 0: + return None + + sweep_dim_idx = 0 + if dims > 1: + max_size = 0 + for i, dim in enumerate(sharded_dims): + dim_size = shards[0].shard_offsets[dim] + shards[0].shard_sizes[dim] + if dim_size > max_size: + max_size = dim_size + sweep_dim_idx = i + sweep_dim = sharded_dims[sweep_dim_idx] + + sorted_indices = sorted( + range(len(shards)), + key=lambda idx: ( + shards[idx].shard_offsets[sweep_dim], + *(shards[idx].shard_offsets[d] for d in sharded_dims if d != sweep_dim), + ), + ) + active: list[tuple[int, int]] = [] + + for idx in sorted_indices: + current = shards[idx] + start = current.shard_offsets[sweep_dim] + end = start + current.shard_sizes[sweep_dim] + + cutoff = bisect_right(active, (start, sys.maxsize)) + if cutoff: + del active[:cutoff] + + for _, other_idx in active: + other = shards[other_idx] + + if _check_shard_metadata_pair_overlap(current, other): + return (other_idx, idx) + insort(active, (end, idx)) + return None + + torch.distributed._shard.sharding_spec._internals._find_nd_overlapping_shards = _find_nd_overlapping_shards + + +def 
patch_torch_validate_global_plan(): + """ + Related: https://github.com/pytorch/pytorch/issues/163548 + https://github.com/pytorch/pytorch/pull/166820 + """ + def _validate_global_plan(global_plan: list[SavePlan], metadata: Metadata) -> bool: + all_good = True + for key, value in metadata.state_dict_metadata.items(): + if isinstance(value, BytesStorageMetadata): + continue + if len(value.size) == 0: + continue + chunks = value.chunks + chunks_volume = 0 + for chunk in chunks: + # Compute the volume + if not _check_box_bounds(value.size, chunk): + logger.warning( + """ + key:%s has out of bounds chunk: + tensor-size:%s chunk: %s + """, + key, + value.size, + chunk, + ) + all_good = False + chunks_volume += math.prod(chunk.sizes) + + if len(chunks) > 1: + dims = len(value.size) + # sweep_dim = max(range(dims), default=0, key=lambda d: value.size[d]) + sweep_dim = 0 # use default sweep_dim, avoid degarding to O(N^2) + sorted_indices = sorted( + range(len(chunks)), + key=lambda idx: ( + chunks[idx].offsets[sweep_dim], + *(chunks[idx].offsets[d] for d in range(dims)), + ), + ) + active: list[tuple[int, int]] = [] + for idx in sorted_indices: + current = chunks[idx] + start = current.offsets[sweep_dim] + end = start + current.sizes[sweep_dim] + + cutoff = bisect_right(active, (start, sys.maxsize)) + if cutoff: + del active[:cutoff] + + for _, other_idx in active: + other = chunks[other_idx] + if _check_box_overlap(current, other): + logger.warning( + "key:%s has overlapping chunks: %s %s", + key, + current, + other, + ) + all_good = False + + insort(active, (end, idx)) + + # Check whether combined chunk cover the whole tensor + tensor_volume = math.prod(value.size) + if len(global_plan) > 1 and chunks_volume != tensor_volume: + logger.warning( + """ + key:%s invalid fill tensor-volume: + %s chunks-volume: %s + """, + key, + tensor_volume, + chunks_volume, + ) + all_good = False + + return all_good + torch.distributed.checkpoint.default_planner._validate_global_plan = _validate_global_plan diff --git a/mcore_adapter/src/mcore_adapter/platforms/__init__.py b/mcore_adapter/src/mcore_adapter/platforms/__init__.py index e62bd224b..ca92058d6 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/__init__.py +++ b/mcore_adapter/src/mcore_adapter/platforms/__init__.py @@ -1,13 +1,12 @@ import torch -from .platform import Platform +from ..utils import get_logger +from .cpu import CpuPlatform from .cuda import CudaPlatform from .npu import NpuPlatform +from .platform import Platform from .rocm import RocmPlatform from .unknown import UnknownPlatform -from .cpu import CpuPlatform - -from ..utils import get_logger logger = get_logger(__name__) diff --git a/mcore_adapter/src/mcore_adapter/platforms/cpu.py b/mcore_adapter/src/mcore_adapter/platforms/cpu.py index 3034200f2..13b905dff 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/cpu.py +++ b/mcore_adapter/src/mcore_adapter/platforms/cpu.py @@ -1,5 +1,6 @@ -from .platform import Platform from ..utils import get_logger +from .platform import Platform + logger = get_logger(__name__) @@ -39,7 +40,3 @@ def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", } return env_vars - - @classmethod - def apply_ulysses_patch(cls) -> None: - return diff --git a/mcore_adapter/src/mcore_adapter/platforms/cuda.py b/mcore_adapter/src/mcore_adapter/platforms/cuda.py index 84bc65c2a..9c598ec1f 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/cuda.py +++ b/mcore_adapter/src/mcore_adapter/platforms/cuda.py @@ -1,8 +1,11 
@@ -from .platform import Platform -from ..utils import get_logger +import os import torch +from ..utils import get_logger +from .platform import Platform + + logger = get_logger(__name__) @@ -35,8 +38,9 @@ def get_custom_env_vars(cls) -> dict: "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", "TORCHINDUCTOR_COMPILE_THREADS": "2", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", - "NCCL_CUMEM_ENABLE": "0", # https://github.com/NVIDIA/nccl/issues/1234 + "NCCL_CUMEM_ENABLE": os.getenv("NCCL_CUMEM_ENABLE", "0"), # https://github.com/NVIDIA/nccl/issues/1234 "NCCL_NVLS_ENABLE": "0", + "NVTE_BWD_LAYERNORM_SM_MARGIN": os.getenv('NVTE_BWD_LAYERNORM_SM_MARGIN', "0"), } return env_vars @@ -45,7 +49,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") @@ -63,13 +68,8 @@ def get_vllm_worker_class(cls): def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: env_vars = { "PYTORCH_CUDA_ALLOC_CONF": "", - "VLLM_ALLOW_INSECURE_SERIALIZATION":"1", + "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", "CUDA_VISIBLE_DEVICES": f"{gpu_rank}", "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES": "1", } return env_vars - - @classmethod - def apply_ulysses_patch(cls) -> None: - from roll.utils.context_parallel import apply_ulysses_patch - apply_ulysses_patch() diff --git a/mcore_adapter/src/mcore_adapter/platforms/npu.py b/mcore_adapter/src/mcore_adapter/platforms/npu.py index fbecd2f8c..e74d59714 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/npu.py +++ b/mcore_adapter/src/mcore_adapter/platforms/npu.py @@ -1,5 +1,6 @@ -from .platform import Platform from ..utils import get_logger +from .platform import Platform + logger = get_logger(__name__) @@ -47,7 +48,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm_ascend.worker.worker_v1 import NPUWorker as Worker logger.info("Successfully imported vLLM V1 Worker.") @@ -69,7 +71,3 @@ def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: "RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES": "1", } return env_vars - - @classmethod - def apply_ulysses_patch(cls) -> None: - return diff --git a/mcore_adapter/src/mcore_adapter/platforms/platform.py b/mcore_adapter/src/mcore_adapter/platforms/platform.py index 1f53f0627..0617fc427 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/platform.py +++ b/mcore_adapter/src/mcore_adapter/platforms/platform.py @@ -1,8 +1,10 @@ -import torch import os +import torch + from ..utils import get_logger + logger = get_logger(__name__) @@ -175,23 +177,3 @@ def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: provide framework-specific environment variables. """ raise NotImplementedError - - @classmethod - def apply_ulysses_patch(cls) -> None: - """ - Apply the Ulysses attention runtime patch to the current environment. - - This method sets up or modifies the necessary environment variables, flags, - or other runtime configurations to enable the Ulysses-optimized attention operations - in vLLM. It ensures that models using the Ulysses attention implementation - can run efficiently on the target hardware. - - Returns: - dict: A dictionary containing the environment variables that were applied - or modified to enable Ulysses attention support. 
- - Raises: - NotImplementedError: This method should be implemented by subclasses to - provide framework- and hardware-specific Ulysses patching. - """ - raise NotImplementedError diff --git a/mcore_adapter/src/mcore_adapter/platforms/rocm.py b/mcore_adapter/src/mcore_adapter/platforms/rocm.py index 0df5fdfa6..01ac286df 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/rocm.py +++ b/mcore_adapter/src/mcore_adapter/platforms/rocm.py @@ -1,7 +1,8 @@ -from .platform import Platform +import torch + from ..utils import get_logger +from .platform import Platform -import torch logger = get_logger(__name__) @@ -34,17 +35,14 @@ def get_custom_env_vars(cls) -> dict: "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", # These VLLM related enviroment variables are related to backend. maybe used afterwards. # "VLLM_USE_TRITON_FLASH_ATTN":"0", - "VLLM_ROCM_USE_AITER":"1", + # "VLLM_ROCM_USE_AITER":"1", # "VLLM_ROCM_USE_AITER_MOE":"1", # "VLLM_ROCM_USE_AITER_ASMMOE":"1", # "VLLM_ROCM_USE_AITER_PAGED_ATTN":"1", # "RAY_DEBUG": "legacy", - "VLLM_USE_V1": "1", + "VLLM_USE_V1": "0", "TORCHINDUCTOR_COMPILE_THREADS": "2", "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True", - "SAFETENSORS_FAST_GPU":"1", - "VLLM_ROCM_USE_AITER_MHA":"0", - "VLLM_ALLOW_LONG_MAX_MODEL_LEN":"1", # "NCCL_DEBUG_SUBSYS":"INIT,COLL", # "NCCL_DEBUG":"INFO", # "NCCL_DEBUG_FILE":"rccl.%h.%p.log", @@ -76,7 +74,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") @@ -104,8 +103,3 @@ def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: # "NCCL_P2P_DISABLE":"1", } return env_vars - - @classmethod - def apply_ulysses_patch(cls) -> None: - from roll.utils.context_parallel import apply_ulysses_patch - apply_ulysses_patch() \ No newline at end of file diff --git a/mcore_adapter/src/mcore_adapter/platforms/unknown.py b/mcore_adapter/src/mcore_adapter/platforms/unknown.py index 156aa9851..14adb6498 100644 --- a/mcore_adapter/src/mcore_adapter/platforms/unknown.py +++ b/mcore_adapter/src/mcore_adapter/platforms/unknown.py @@ -1,7 +1,8 @@ -from .platform import Platform +import torch + from ..utils import get_logger +from .platform import Platform -import torch logger = get_logger(__name__) @@ -36,19 +37,20 @@ def get_custom_env_vars(cls) -> dict: # So we set a small timeout for PullObjectsAndGetFromPlasmaStore to avoid holding store_client lock # too long. 
"RAY_get_check_signal_interval_milliseconds": "1", - "VLLM_ALLOW_INSECURE_SERIALIZATION":"1", + "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", "TORCHINDUCTOR_COMPILE_THREADS": "2", "HGGC_ENABLE_KERNEL_COPY": "0", "NCCL_PF_U2MM_HOST": "0", } return env_vars - + @classmethod def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") @@ -63,14 +65,9 @@ def get_vllm_worker_class(cls): raise RuntimeError("vLLM is not installed or not properly configured.") from e @classmethod - def get_vllm_run_time_env_vars(cls, gpu_rank:str) -> dict: + def get_vllm_run_time_env_vars(cls, gpu_rank: str) -> dict: env_vars = { - "PYTORCH_CUDA_ALLOC_CONF" : "", - "VLLM_ALLOW_INSECURE_SERIALIZATION":"1", + "PYTORCH_CUDA_ALLOC_CONF": "", + "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", } return env_vars - - @classmethod - def apply_ulysses_patch(cls) -> None: - from roll.utils.context_parallel import apply_ulysses_patch - apply_ulysses_patch() \ No newline at end of file diff --git a/mcore_adapter/src/mcore_adapter/trainer/trainer.py b/mcore_adapter/src/mcore_adapter/trainer/trainer.py index 16dc346ba..3b37239a9 100644 --- a/mcore_adapter/src/mcore_adapter/trainer/trainer.py +++ b/mcore_adapter/src/mcore_adapter/trainer/trainer.py @@ -1,7 +1,6 @@ import math import os import random -import shutil import sys import time import warnings @@ -27,6 +26,7 @@ get_moe_layer_wise_logging_tracker, reduce_aux_losses_tracker_across_ranks, ) +from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper from torch._tensor import Tensor from torch.utils.data import DataLoader, Dataset, RandomSampler from transformers import PreTrainedTokenizerBase @@ -47,11 +47,13 @@ set_seed, speed_metrics, ) +from transformers.utils import is_peft_available -from ..platforms import current_platform from ..checkpointing import get_checkpoint_dir, load_state_dict_from_checkpoint -from ..constants import DIST_OPTIMIZER_DIR, IGNORE_INDEX +from ..constants import ADAPTER_CONFIG_NAME, DIST_OPTIMIZER_DIR, IGNORE_INDEX from ..initialize import initialize_megatron +from ..patcher import patch_torch_find_nd_overlapping_shards, patch_torch_validate_global_plan +from ..platforms import current_platform from ..training_args import TrainingArguments from ..utils import distributed_reduce, get_logger from .utils import ( @@ -67,6 +69,11 @@ from ..models import VirtualModels + +if is_peft_available(): + from peft import PeftModel + + logger = get_logger(__name__) @@ -81,6 +88,8 @@ def __init__( args: TrainingArguments = None, **kwargs, ): + patch_torch_find_nd_overlapping_shards() + patch_torch_validate_global_plan() initialize_megatron(args=args) self.args = args super().__init__( @@ -247,6 +256,8 @@ def _prepare_train_inputs(self, data_iterator: Iterator) -> Dict[str, Tensor | A def _pre_compute_loss(self, data_iterator: Iterator, model: DistributedDataParallel): inputs = self._prepare_train_inputs(data_iterator) loss_mask = (inputs["labels"] != IGNORE_INDEX).float() + if "loss_mask" not in inputs: + inputs["loss_mask"] = loss_mask output_tensor = model(**inputs) return output_tensor, loss_mask @@ -384,7 +395,9 @@ def _stream_eval_inputs(self, eval_dataloader: DataLoader, standard_batch_size: max_seq_length = 0 standard_batch_size = standard_batch_size or self.args.per_device_eval_batch_size - pad_func = lambda x, length: 
[self._pad_batched_inputs(i, length) for i in x] + def pad_func(x, length): + return [self._pad_batched_inputs(i, length) for i in x] + end_flag = torch.tensor(0, device=self.args.device) for inputs in eval_dataloader: main_inputs = inputs[self.model.main_input_name] @@ -495,8 +508,27 @@ def create_scheduler(self, num_training_steps: int, optimizer: "MegatronOptimize def _load_from_checkpoint(self, resume_from_checkpoint, model=None): # TODO: support resume _CUDA_RNG_STATE_TRACKER (which is needed for dropout/init model weights) model = model or self.model - logger.info(f"Loading model from {resume_from_checkpoint}.") - state_dict = load_state_dict_from_checkpoint(resume_from_checkpoint) + if isinstance(model[0], PeftModel): + state_dict = {} + adapter_subdirs = ( + [ + folder_name + for folder_name in os.listdir(resume_from_checkpoint) + if os.path.isdir(os.path.join(resume_from_checkpoint, folder_name)) + and os.path.isfile(os.path.join(resume_from_checkpoint, folder_name, ADAPTER_CONFIG_NAME)) + ] + if os.path.isdir(resume_from_checkpoint) + else [] + ) + if adapter_subdirs: + for subdir_name in adapter_subdirs: + peft_id = os.path.join(resume_from_checkpoint, subdir_name) + logger.info(f"Loading adapter from {peft_id}.") + peft_state_dict = load_state_dict_from_checkpoint(peft_id) + state_dict[subdir_name] = peft_state_dict + else: + logger.info(f"Loading model from {resume_from_checkpoint}.") + state_dict = load_state_dict_from_checkpoint(resume_from_checkpoint) assert state_dict is not None, "No model state_dict found in checkpoint." model.load_state_dict(state_dict) @@ -643,6 +675,11 @@ def _prepare_train_loop( else: self.state.save_steps = args.save_steps + # ckpt loading + if resume_from_checkpoint is not None: + if self.is_fsdp_enabled: + self._load_from_checkpoint(resume_from_checkpoint, self.model) + self._load_optimizer_and_scheduler(resume_from_checkpoint) # Train! 
@@ -912,6 +949,20 @@ def _maybe_log_save_evaluate( clear_aux_losses_tracker() + mtp_losses = {} + if self.model.config.mtp_num_layers is not None and self.model.config.mtp_num_layers > 0: + if self.control.should_log: + MTPLossLoggingHelper.reduce_loss_in_tracker() + tracker = MTPLossLoggingHelper.tracker + loss_scale = 1 / self.args.gradient_accumulation_steps + MTPLossLoggingHelper.track_mtp_metrics( + loss_scale, + iteration=self.state.global_step, # Not used when total_loss_dict is provided + writer=None, + wandb_writer=None, + total_loss_dict=mtp_losses, + ) + if self.control.should_log and self.state.global_step > self._globalstep_last_logged: logs = {} loss = tr_loss.clone().detach() @@ -927,16 +978,12 @@ def _maybe_log_save_evaluate( # reset tr_loss to zero tr_loss -= tr_loss - # logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) - if self.args.calculate_per_token_loss: - logs["loss"] = round(tr_loss_scalar, 4) - else: - logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) - + logs["loss"] = round(tr_loss_scalar / (self.state.global_step - self._globalstep_last_logged), 4) if grad_norm is not None: logs["grad_norm"] = grad_norm.detach().item() if isinstance(grad_norm, torch.Tensor) else grad_norm logs["learning_rate"] = self._get_learning_rate() logs.update(moe_losses) + logs.update(mtp_losses) if metrics_tensors is not None and len(self.metrics_keys) > 1: # metrics except loss metrics = self.gather_metrics(metrics_tensors) metrics.pop("loss", None) @@ -954,6 +1001,9 @@ def _maybe_log_save_evaluate( if self.control.should_save: self._save_checkpoint(model, trial) self.control = self.callback_handler.on_save(self.args, self.state, self.control) + ckpt_id = f"{PREFIX_CHECKPOINT_DIR}-{self.state.global_step}" + checkpoint_path = os.path.join(self.args.output_dir, ckpt_id) + self.upload_to_mos(ckpt_id, checkpoint_path) if eval_or_save: self.enable_ddp_forward_pre_hook() diff --git a/mcore_adapter/src/mcore_adapter/trainer/utils.py b/mcore_adapter/src/mcore_adapter/trainer/utils.py index cbdefa891..dc4e8a928 100644 --- a/mcore_adapter/src/mcore_adapter/trainer/utils.py +++ b/mcore_adapter/src/mcore_adapter/trainer/utils.py @@ -1,4 +1,4 @@ -from typing import TYPE_CHECKING, List, Dict +from typing import TYPE_CHECKING import torch from megatron.core.optimizer_param_scheduler import OptimizerParamScheduler @@ -84,8 +84,6 @@ def check_pack_seq_aligned(attention_mask: "torch.Tensor", align_size: int): False ``` """ - bsz = attention_mask.size(0) - dtype, device = attention_mask.dtype, attention_mask.device max_num = torch.max(attention_mask).item() is_valid = True for i in range(max_num): @@ -107,7 +105,7 @@ def step(self, increment=1): super().step(increment) self._last_lr = [group["lr"] for group in self.optimizer.param_groups] - def get_last_lr(self) -> List[float]: + def get_last_lr(self) -> list[float]: """Return last computed learning rate by current scheduler.""" return self._last_lr @@ -115,8 +113,10 @@ def get_last_lr(self) -> List[float]: def get_megatron_lr_scheduler(args: "TrainingArguments", num_training_steps: int, optimizer: "MegatronOptimizer"): scheduler_type_map = { # hf to megatron "constant_with_warmup": "constant", + "inverse_sqrt": "inverse-square-root", "cosine_with_min_lr": "cosine", - "wsd": "WSD", + "cosine_warmup_with_min_lr": "cosine", + "warmup_stable_decay": "WSD", } lr_scheduler_kwargs = args.lr_scheduler_kwargs or {} max_lr = lr_scheduler_kwargs.get("max_lr", 
args.learning_rate) @@ -127,6 +127,15 @@ def get_megatron_lr_scheduler(args: "TrainingArguments", num_training_steps: int lr_decay_style = scheduler_type_map.get(lr_scheduler_type, lr_scheduler_type) if lr_decay_style not in ["constant", "cosine", "linear", "inverse-square-root", "WSD"]: raise ValueError(f"lr_scheduler_type {lr_scheduler_type} is not supported") + kwargs = {} + if lr_decay_style == "WSD": + wsd_decay_steps = lr_scheduler_kwargs.get("wsd_decay_steps", None) + lr_wsd_decay_style = lr_scheduler_kwargs.get("lr_wsd_decay_style", None) + assert wsd_decay_steps is not None, "wsd_decay_steps is required for WSD" + kwargs = { + "wsd_decay_steps": wsd_decay_steps, + "lr_wsd_decay_style": lr_wsd_decay_style, + } return MegatronLRScheduler( optimizer, @@ -140,4 +149,5 @@ def get_megatron_lr_scheduler(args: "TrainingArguments", num_training_steps: int end_wd=args.weight_decay, wd_incr_style="constant", wd_incr_steps=0, + **kwargs, ) diff --git a/mcore_adapter/src/mcore_adapter/training_args.py b/mcore_adapter/src/mcore_adapter/training_args.py index 251fe9b34..b63554ab1 100644 --- a/mcore_adapter/src/mcore_adapter/training_args.py +++ b/mcore_adapter/src/mcore_adapter/training_args.py @@ -14,6 +14,16 @@ @dataclass class DistributingParallelArguments: + """ + NOTE: + - Most arguments should default to None to avoid overwriting checkpoint configurations + - Only training-only parameters (not affecting model checkpoints) should have non-None defaults (e.g., `variable_seq_lengths`) + - This class has high priority and will override config values read from checkpoints + - For minor configurations, consider using the `additional_configs` instead of adding adding new fields + + CONFIGURATION EFFECTS: + Arguments are passed to TransformerConfig during model loading from hf/megatron checkpoints + """ tensor_model_parallel_size: Optional[int] = field( default=None, metadata={"help": "Degree of tensor model parallelism."}, @@ -202,6 +212,25 @@ class DistributingParallelArguments: "choices": ["local", "transformer_engine"], }, ) + fp8_recipe: Optional[str] = field( + default=None, + metadata={ + "help": "FP8 recipe as defined in mcore. If None, FP8 is not used. Supported recipes: " + "'mxfp8' on blackwell, 'blockwise' on hopper. Other recipes are not tested yet.", + # NOTE: mxfp8 does not work with moe recompute_modules if moe is used. + }, + ) + fp8_param: bool = field( + default=False, + # TODO: fp8_param does not work with mxfp8 for now, check TE support later. + metadata={"help": "If true, use fp8 weights during training instead of bf16."}, + ) + fp8: Optional[str] = field( + default=None, + metadata={ + "help": "FP8 format to use. Supported formats: 'e4m3', 'hybrid'. Do not change if unsure", + }, + ) additional_configs: Optional[Union[dict, str]] = field( default_factory=dict, metadata={ diff --git a/mcore_adapter/src/mcore_adapter/utils.py b/mcore_adapter/src/mcore_adapter/utils.py index c56fdb830..b8a60b5a3 100644 --- a/mcore_adapter/src/mcore_adapter/utils.py +++ b/mcore_adapter/src/mcore_adapter/utils.py @@ -1,4 +1,3 @@ -import importlib.util import logging import sys from typing import Any, Mapping @@ -67,9 +66,14 @@ def divide(numerator, denominator): return numerator // denominator -def _is_package_available(name: str) -> bool: - return importlib.util.find_spec(name) is not None - - -def is_peft_available() -> bool: - return _is_package_available("peft") +def is_megatron_llama(): + """ + Check if the installed package is megatron-llama-core rather than megatron-core. 
+ Use cached_value to avoid re-checking the package. + """ + if not hasattr(is_megatron_llama, "cached_value"): + from importlib.metadata import distributions + is_megatron_llama.cached_value = any( + dist.metadata.get('Name') == 'megatron-llama-core' for dist in distributions() + ) + return is_megatron_llama.cached_value diff --git a/mcore_adapter/tools/convert.py b/mcore_adapter/tools/convert.py index 1a5ebab0b..e1c0ae8d1 100644 --- a/mcore_adapter/tools/convert.py +++ b/mcore_adapter/tools/convert.py @@ -3,7 +3,7 @@ from typing import Optional import torch -from transformers import AutoConfig, AutoTokenizer, HfArgumentParser +from transformers import AutoConfig, HfArgumentParser from mcore_adapter.models.converter.post_converter import convert_checkpoint_to_hf, convert_checkpoint_to_mca from mcore_adapter.training_args import DistributingParallelArguments @@ -16,6 +16,7 @@ @dataclass class ConvertArguments: checkpoint_path: str + adapter_path: str | None = field(default=None) output_path: str = field(default="./output") bf16: bool = field(default=False) fp16: bool = field(default=False) @@ -27,13 +28,19 @@ def __post_init__(self): if self.bf16 and self.fp16: raise ValueError("bf16 and fp16 cannot be both True.") + def convert_mca_to_hf(convert_args: ConvertArguments): torch_dtype = None if convert_args.bf16: torch_dtype = torch.bfloat16 elif convert_args.fp16: torch_dtype = torch.float16 - convert_checkpoint_to_hf(convert_args.checkpoint_path, convert_args.output_path, torch_dtype=torch_dtype) + convert_checkpoint_to_hf( + convert_args.checkpoint_path, + convert_args.output_path, + adapter_name_or_path=convert_args.adapter_path, + torch_dtype=torch_dtype, + ) config = AutoConfig.from_pretrained(convert_args.output_path, trust_remote_code=True) if convert_args.convert_model_max_length is not None: @@ -42,12 +49,16 @@ def convert_mca_to_hf(convert_args: ConvertArguments): config.save_pretrained(convert_args.output_path) logger.info(f"\n ==============HF config===========: \n {config}") + def main(): convert_args, dist_args = HfArgumentParser( [ConvertArguments, DistributingParallelArguments] ).parse_args_into_dataclasses() - mca_config_path = os.path.join(convert_args.checkpoint_path, "mca_config.json") + if convert_args.adapter_path is not None: + mca_config_path = os.path.join(convert_args.adapter_path, "mca_config.json") + else: + mca_config_path = os.path.join(convert_args.checkpoint_path, "mca_config.json") from_mca = os.path.exists(mca_config_path) if not from_mca: diff --git a/requirements_common.txt b/requirements_common.txt index 1bff312ab..5af345be3 100644 --- a/requirements_common.txt +++ b/requirements_common.txt @@ -1,4 +1,4 @@ -ray[default,cgraph] # vllm required ray[default,cgraph]>=2.48.0 +ray[default,cgraph]==2.48.0 # vllm required ray[default,cgraph]>=2.48.0 numpy<2.0a0,>=1.25 tensordict sympy diff --git a/roll/__init__.py b/roll/__init__.py index e69de29bb..a874f1fc7 100644 --- a/roll/__init__.py +++ b/roll/__init__.py @@ -0,0 +1,11 @@ +# set RAY_DEDUP_LOGS=0 before importing ray +import os +os.environ["RAY_DEDUP_LOGS"] = os.getenv("RAY_DEDUP_LOGS", "1") + +# Enable deterministic mode if DETERMINISTIC_MODE environment variable is set +if os.getenv("DETERMINISTIC_MODE", "0") == "1": + import torch + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + torch.use_deterministic_algorithms(True, warn_only=False) + print("Deterministic mode enabled") diff --git a/roll/configs/base_config.py b/roll/configs/base_config.py index 
dd918673f..bd84be971 100644 --- a/roll/configs/base_config.py +++ b/roll/configs/base_config.py @@ -3,15 +3,32 @@ import sys from dataclasses import dataclass, field from datetime import datetime -from typing import Dict, Literal, Optional, Union +from typing import Dict, Literal, Optional, Union, List -from roll.configs.worker_config import WorkerConfig, is_colocated -from roll.utils.config_utils import validate_megatron_batch_size, calculate_megatron_dp_size +from roll.configs.worker_config import WorkerConfig, is_actor_infer_overlapping_with_any_cluster +from roll.platforms import current_platform +from roll.utils.config_utils import (calculate_megatron_dp_size, + validate_megatron_batch_size) from roll.utils.logging import get_logger - logger = get_logger() +@dataclass +class RolloutMockConfig: + """Configuration for rollout dump/mock mechanism for precision alignment testing.""" + enable: bool = field( + default=False, + metadata={"help": "Enable rollout dump/mock mechanism for precision alignment testing"} + ) + mode: Literal["dump", "mock"] = field( + default="dump", + metadata={"help": "dump: save rollout data, mock: load pre-recorded data"} + ) + dump_dir: str = field( + default="./rollout_mock_dumps", + metadata={"help": "Storage directory for rollout dump/mock data"} + ) + @dataclass class ScheduleConfig: generate_opt_level: int = field( @@ -35,10 +52,14 @@ class ScheduleConfig: max_additional_running_prompts: int = field( default=16, metadata={"help": "The additional number of running prompts, beyond batch_size."} ) + user_defined_rollout_loop_cls: str = field( + default="roll.distributed.scheduler.user_defined_rollout_loop.UserDefinedRolloutLoop", + metadata={"help": "Path to class UserDefinedRolloutLoop."} + ) @dataclass -class BaseConfig: +class BaseConfig(ScheduleConfig): exp_name: str = field( default=os.path.basename(sys.argv[0])[: -len(".py")], @@ -136,10 +157,6 @@ class BaseConfig: default=None, metadata={"help": "The maximum length of the sequence to be padded."}, ) - alive_check_interval: int = field( - default=10, - metadata={"help": "The interval of worker alive check."} - ) profiler_timeline: bool = field(default=False, metadata={"help": "Whether to use profiler mode or not."}) profiler_memory: bool = field(default=False, metadata={"help": "Whether to use profiler memory or not."}) report_length_and_rewards: bool = field(default=False, metadata={"help": "Whether to report lengths and rewards of prompts in each epoch."}) @@ -156,6 +173,10 @@ class BaseConfig: default_factory=dict, metadata={"help": "system environment variables."} ) + model_update_buffer_size_mb: int = field( + default=1024, + metadata={"help": "Buffer size in MB for model update operations (e.g., 1024 = 1GB)."} + ) num_nodes: int = field( default=1, metadata={"help": "Number of nodes available for distributed training."} @@ -172,6 +193,10 @@ class BaseConfig: default=None, metadata={"help": "snapshot_download func source type, such as MODELSCOPE, HUGGINGFACE_HUB."}, ) + rollout_mock: Optional[RolloutMockConfig] = field( + default=None, + metadata={"help": "Rollout mock configuration for precision alignment testing."} + ) def to_dict(self): @@ -251,6 +276,15 @@ def __post_init__(self): from ..platforms import current_platform self.num_gpus_per_node = current_platform.device_count() + if hasattr(self, 'actor_train') and isinstance(self.actor_train, WorkerConfig): + self.actor_train.system_envs.update({k: v for k, v in self.system_envs.items() if k not in self.actor_train.system_envs}) + if 
hasattr(self, 'actor_infer') and isinstance(self.actor_infer, WorkerConfig): + self.actor_infer.system_envs.update({k: v for k, v in self.system_envs.items() if k not in self.actor_infer.system_envs}) + if hasattr(self, 'reference') and isinstance(self.reference, WorkerConfig): + self.reference.system_envs.update({k: v for k, v in self.system_envs.items() if k not in self.reference.system_envs}) + if hasattr(self, 'critic') and isinstance(self.critic, WorkerConfig): + self.critic.system_envs.update({k: v for k, v in self.system_envs.items() if k not in self.critic.system_envs}) + # Validate rollout_batch_size divisibility for Megatron data parallelism if hasattr(self, 'actor_train') and isinstance(self.actor_train, WorkerConfig) and self.actor_train.strategy_args is not None: strategy_name = self.actor_train.strategy_args.strategy_name @@ -271,6 +305,15 @@ def __post_init__(self): f"Skipping DP validation for non-Megatron actor_train strategy: {strategy_name}" ) + if hasattr(self, 'actor_infer') and isinstance(self.actor_infer, WorkerConfig) and self.actor_infer.strategy_args is not None: + strategy_name = self.actor_infer.strategy_args.strategy_name + assert strategy_name in ["vllm", "sglang"] + # Use max_running_requests+1 to reserve extra one for abort_requests. + # 1000 is ray_constants.DEFAULT_MAX_CONCURRENCY_ASYNC. + max_concurrency = max(self.max_running_requests + 1, 1000) + self.actor_infer.max_concurrency = max(self.actor_infer.max_concurrency, max_concurrency) + logger.info(f"Set max_concurrency of actor_infer to {self.actor_infer.max_concurrency}") + # the required num nodes total_devices = [] for attribute_name in dir(self): @@ -293,18 +336,62 @@ def set_max_steps(self, max_steps: int): if hasattr(attribute, "training_args"): setattr(attribute.training_args, "max_steps", max_steps) - def validate_worker_config(self): - # check if current worker supports sequence packing - allowed_names = { - 'student', 'teacher', 'sft_train', - } - for attr_name in dir(self): - attr = getattr(self, attr_name) - if isinstance(attr, WorkerConfig) and attr.use_sequence_packing: - if attr.name not in allowed_names: - raise ValueError( - f"Worker '{attr.name}' (from field '{attr_name}') don't support use sequence packing now" - ) +@dataclass +class TrainInferISWeightConfig: + enabled: bool = field( + default=False, + metadata={"help": "Whether to generate train-infer IS weight and store it into batch (train_infer_is_weight)."}, + ) + weight_type: Literal["token", "segment", "geometric", "sequence"] = field( + default="token", + metadata={"help": "Granularity for IS weight: token / segment / geometric / sequence."}, + ) + upper_bound: Optional[float] = field( + default=1.2, + metadata={"help": "Upper bound (clamp) for IS weight. 
Set to None to disable clamping."}, + ) + detach: bool = field( + default=True, + metadata={"help": "Detach IS weight tensor to prevent gradient flow (recommended)."}, + ) + + +@dataclass +class TrainInferFilterConfig: + enabled: bool = field( + default=False, + metadata={"help": "Whether to enable this filter rule (applied to response_mask)."}, + ) + agg_type: Literal["token", "segment", "geometric", "sequence"] = field( + default="token", + metadata={"help": "Aggregation level used for filtering: token / segment / geometric / sequence."}, + ) + + ratio_enabled: bool = field( + default=True, + metadata={"help": "Whether to apply ratio-based filtering (exp(old_logp - infer_logp))."}, + ) + ratio_low: float = field(default=0.8, metadata={"help": "Lower threshold for ratio filtering."}) + ratio_high: float = field(default=1.2, metadata={"help": "Upper threshold for ratio filtering."}) + + diff_enabled: bool = field( + default=False, + metadata={"help": "Whether to apply diff-based filtering (exp(old) - exp(infer))."}, + ) + diff_low: float = field(default=-0.2, metadata={"help": "Lower threshold for diff filtering."}) + diff_high: float = field(default=0.2, metadata={"help": "Upper threshold for diff filtering."}) + + +@dataclass +class TrainInferCorrectionConfig: + is_weight: TrainInferISWeightConfig = field( + default_factory=TrainInferISWeightConfig, + metadata={"help": "Config for generating train-infer IS weight (stored in batch)."}, + ) + filters: List[TrainInferFilterConfig] = field( + default_factory=list, + metadata={"help": "A list of filter rules applied sequentially to response_mask."}, + ) @dataclass class PPOConfig(BaseConfig): @@ -325,6 +412,7 @@ class PPOConfig(BaseConfig): reference: WorkerConfig = field( default_factory=WorkerConfig, metadata={"help": "Configuration for the reference role."} ) + reward: WorkerConfig = field(default_factory=WorkerConfig, metadata={"help": "Configuration for reward inference."}) async_generation_ratio: float = field( default=0, @@ -405,8 +493,20 @@ class PPOConfig(BaseConfig): enable_old_logprobs_recompute: bool = field(default=False, metadata={"help": "Enable old_logprobs computation optimization for disable caching"}) force_disable_old_logprobs_recompute: bool = field(default=False, metadata={"help": "Force disable old_logprobs computation optimization for disable caching, priority is higher than enable_old_logprobs_recompute"}) + train_infer_correction: TrainInferCorrectionConfig = field( + default_factory=TrainInferCorrectionConfig, + metadata={ + "help": ( + "Train-infer correction config for off-policy/mismatch handling. " + "Pipeline will compute train_infer_is_weight from old_log_probs vs infer_logprobs " + "and optionally apply filters to response_mask." 
+ ) + }, + ) + def __post_init__(self): super().__post_init__() + assert self.async_generation_ratio == 0 or self.generate_opt_level == 1 if ( self.actor_train.model_args.model_name_or_path is None @@ -433,6 +533,8 @@ def __post_init__(self): self.enable_reference = True if self.force_disable_old_logprobs_recompute: self.enable_old_logprobs_recompute = False + elif self.adv_estimator in ['step_reinforce', "gigpo"]: + self.enable_old_logprobs_recompute = True else: self.set_old_logprobs_status() @@ -448,23 +550,33 @@ def set_max_steps(self, max_steps: int): * self.critic.training_args.gradient_accumulation_steps ) # 没有除dp_size,需要在分布式环境初始化后再除 - self.actor_train.training_args.max_steps = max_steps * ( - self.rollout_batch_size + # 先计算总的训练步数,最后再除以 backward_batch_size + self.actor_train.training_args.max_steps = max(1, ( + max_steps + * self.rollout_batch_size * self.actor_infer.generating_args.num_return_sequences * self.ppo_epochs // actor_backward_batch_size - ) - self.critic.training_args.max_steps = max_steps * ( - self.rollout_batch_size + )) + self.critic.training_args.max_steps = max(1, ( + max_steps + * self.rollout_batch_size * self.actor_infer.generating_args.num_return_sequences // critic_backward_batch_size - ) + )) logger.info(f"pipeline max_steps: {self.max_steps} to {max_steps}") logger.info(f"actor train max_steps without dp_size: {self.actor_train.training_args.max_steps}") logger.info(f"critic train max_steps without dp_size: {self.critic.training_args.max_steps}") self.max_steps = max_steps + def _get_effective_cp_size_ulysses(self, configured_ulysses_size: Optional[int]) -> int: + if not configured_ulysses_size or configured_ulysses_size <= 1: + return 1 + if current_platform.apply_ulysses_patch() is not None: + return configured_ulysses_size + return 1 + def set_old_logprobs_status(self): batch_size = self.rollout_batch_size * self.actor_infer.generating_args.num_return_sequences actor_backward_batch_size = ( @@ -474,7 +586,13 @@ def set_old_logprobs_status(self): dp_size = 1 if self.actor_train.strategy_args is not None: if self.actor_train.strategy_args.strategy_name == "deepspeed_train": - dp_size = len(self.actor_train.device_mapping) + configured_ulysses_size = getattr(self.actor_train.model_args, 'ulysses_size', None) or 1 + cp_size = self._get_effective_cp_size_ulysses(configured_ulysses_size) + dp_size = len(self.actor_train.device_mapping) // cp_size + elif self.actor_train.strategy_args.strategy_name in ("fsdp2_train", "fsdp2_infer"): + configured_ulysses_size = getattr(self.actor_train.model_args, 'ulysses_size', None) or 1 + cp_size = self._get_effective_cp_size_ulysses(configured_ulysses_size) + dp_size = len(self.actor_train.device_mapping) // cp_size elif self.actor_train.strategy_args.strategy_name == "megatron_train": strategy_config = self.actor_train.strategy_args.strategy_config tp = strategy_config.get('tensor_model_parallel_size', 1) @@ -504,6 +622,11 @@ def async_pipeline(self) -> bool: return self.async_generation_ratio > 0 @property - def is_train_infer_colocated(self) -> bool: - """Whether actor_train and actor_infer are colocated.""" - return is_colocated(self.actor_train, self.actor_infer) + def is_actor_infer_colocated(self) -> bool: + """Whether actor_infer are colocated with any other clusters (exclude reward).""" + return is_actor_infer_overlapping_with_any_cluster( + actor_infer=self.actor_infer, + actor_train=self.actor_train, + reference=self.reference, + critic=self.critic + ) diff --git a/roll/configs/data_args.py 
b/roll/configs/data_args.py index 54ff17a87..921ecd089 100644 --- a/roll/configs/data_args.py +++ b/roll/configs/data_args.py @@ -36,7 +36,6 @@ class DataArguments: id: Optional[str] = field(default="id", metadata={"help": "Which column in file to use as id"}) prompt: Optional[str] = field(default=None, metadata={"help": "Which column in file to use as prompt"}) response: Optional[str] = field(default="solution", metadata={"help": "Which column in file to use as label"}) - # image: Optional[str] = field(default='image', metadata={"help": "Which column in file to use as image"}) messages: Optional[str] = field(default=None, metadata={"help": "Which column in file to use as messages"}) def __post_init__(self): diff --git a/roll/configs/generating_args.py b/roll/configs/generating_args.py index 68cf88d17..848ce7af3 100644 --- a/roll/configs/generating_args.py +++ b/roll/configs/generating_args.py @@ -58,6 +58,10 @@ class GeneratingArguments: default=None, metadata={"help": "Whether to include the stop strings in output text."}, ) + logprobs: Optional[int] = field( + default=0, + metadata={"help": "The number of logprobs to return. Set None to not return logprobs."}, + ) def to_dict(self) -> Dict[str, Any]: args = asdict(self) diff --git a/roll/configs/model_args.py b/roll/configs/model_args.py index ce300250a..c9b8b8446 100644 --- a/roll/configs/model_args.py +++ b/roll/configs/model_args.py @@ -17,6 +17,14 @@ class LoraArguments: "help": "Name(s) of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint." }, ) + autocast_adapter_dtype: bool = field( + default=True, + metadata={ + "help": "Whether to autocast the adapter dtype. Defaults to `True`. Right now, " + "this will only cast adapter weights using float16 or bfloat16 to float32, " + "as this is typically required for stable training, and only affect select PEFT tuners." + }, + ) lora_alpha: Optional[int] = field( default=None, metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2)."}, @@ -69,17 +77,26 @@ class ModelArguments(LoraArguments): default=False, metadata={"help": "Whether or not to disable gradient checkpointing."}, ) + gradient_checkpointing_use_reentrant: Optional[bool] = field( + default=None, + metadata={ + "help": ( + "Gradient checkpointing implementation toggle for torch.utils.checkpoint.\n" + "- None (default): auto (use reentrant=True for MoE models; otherwise False)\n" + ) + }, + ) device_map: Optional[str] = field( default="balanced", metadata={"help": "transformer's from_pretrained device map"} ) dtype: Optional[Literal["fp32", "bf16", "fp16"]] = field( default="bf16", metadata={"help": "Set model dtype as fp32, bf16, or fp16, otherwise use config's torch_dtype"} ) - model_type: Optional[Literal["auto_sequence_classification", "auto_token_classification", "trl", "diffusion_module"]] = field( + model_type: Optional[ + Literal["auto_sequence_classification", "auto_token_classification", "trl", "diffusion_module"] + ] = field( default=None, - metadata={ - "help": "reward model type." - }, + metadata={"help": "reward model type."}, ) num_labels: Optional[int] = field( default=1, @@ -100,9 +117,7 @@ class ModelArguments(LoraArguments): ) ulysses_size: Optional[int] = field( default=1, - metadata={ - "help": "The group size for Ulysses attention." 
- }, + metadata={"help": "The group size for Ulysses attention."}, ) def __post_init__(self): diff --git a/roll/configs/training_args.py b/roll/configs/training_args.py index 4b7ad2661..92f1d81e3 100644 --- a/roll/configs/training_args.py +++ b/roll/configs/training_args.py @@ -92,6 +92,10 @@ class TrainingArguments: metadata={"help": "Linear warmup over warmup_ratio fraction of total steps."} ) warmup_steps: int = field(default=0, metadata={"help": "Linear warmup over warmup_steps."}) + save_hf_model: bool = field( + default=False, + metadata={"help": "Save model as hf format."} + ) def get_warmup_steps(self, num_training_steps: int): """ diff --git a/roll/configs/worker_config.py b/roll/configs/worker_config.py index 0c7c9ea12..5ceb721a5 100644 --- a/roll/configs/worker_config.py +++ b/roll/configs/worker_config.py @@ -4,24 +4,70 @@ from roll.configs import DataArguments, GeneratingArguments, ModelArguments from roll.configs.training_args import TrainingArguments from roll.utils.logging import get_logger - - logger = get_logger() @dataclass class StrategyArguments: strategy_name: Literal[ - "deepspeed_train", "hf_infer", "deepspeed_infer", "vllm", "sglang", "megatron_infer", "megatron_train", "mock_infer", "diffusion_deepspeed_train" + "deepspeed_train", + "hf_infer", + "deepspeed_infer", + "vllm", + "sglang", + "megatron_infer", + "megatron_train", + "diffusion_deepspeed_train", + "fsdp2_train", + "fsdp2_infer", ] = field( default="deepspeed_train", metadata={ - "help": "The name of the strategy. Options: 'deepspeed_train', 'diffusion_deepspeed_train', 'hf_infer', 'deepspeed_infer', 'mock_infer', 'vllm', 'sglang', " - "'megatron_infer', 'megatron_train'." + "help": "The name of the strategy. Options: 'deepspeed_train', 'diffusion_deepspeed_train', 'hf_infer', 'deepspeed_infer', 'vllm', 'sglang', " + "'megatron_infer', 'megatron_train', 'fsdp2_train', 'fsdp2_infer'." }, ) strategy_config: Optional[Dict] = field( - default_factory=dict, metadata={"help": "Configuration dictionary for the strategy."} + default_factory=dict, + metadata={"help": "Configuration dictionary for the strategy."}, + ) + +@dataclass +class SequencePackingConfig: + algorithm: str = field( + default="none", + metadata={"help": "Sequence packing algorithm: 'none' (default partitioning) or 'load_balance' " + "(redistribute sentences across microbatches for better load balancing). " + "Note: 'load_balance' requires proper loss scaling as microbatches contain " + "different numbers of sentences."} + ) + + max_packed_sequence_length_forward: int = field( + default=None, + metadata={"help": "Maximum sequence length after packing sentences in a microbatch during inference. " + "With context parallelism enabled, each CP rank handles " + "max_packed_sequence_length_forward // cp_size."} + ) + + max_packed_sequence_length_train: int = field( + default=None, + metadata={"help": "Maximum sequence length after packing sentences in a microbatch during training. " + "With context parallelism enabled, each CP rank handles " + "max_packed_sequence_length_train // cp_size."} + ) + + min_num_micro_batches_forward: int = field( + default=1, + metadata={"help": "Minimum number of microbatches per mini-batch during inference. " + "Used with 'load_balance' algorithm to control samples per microbatch " + "and memory usage."} + ) + + min_num_micro_batches_train: int = field( + default=1, + metadata={"help": "Minimum number of microbatches per mini-batch (per gradient update) during training. 
" + "Used with 'load_balance' algorithm to control samples per microbatch " + "and memory usage."} ) @@ -31,13 +77,10 @@ class WorkerConfig: default=None, metadata={"help": "name of this role."}, ) - worker_cls: Optional[str] = field( - default=None, - metadata={"help": "The class of the worker."} - ) + worker_cls: Optional[str] = field(default=None, metadata={"help": "The class of the worker."}) pg_variant: Optional[str] = field( default=None, - metadata={"help": "The variant of the policy gradient."} + metadata={"help": "The variant of the policy gradient."}, ) model_args: ModelArguments = field( default_factory=ModelArguments, @@ -45,23 +88,21 @@ class WorkerConfig: ) training_args: TrainingArguments = field( default_factory=TrainingArguments, - metadata={"help": "Training-related arguments."} + metadata={"help": "Training-related arguments."}, ) data_args: DataArguments = field( default=None, - metadata={"help": "Data-related arguments; optional and can be None."} + metadata={"help": "Data-related arguments; optional and can be None."}, ) generating_args: GeneratingArguments = field( default=None, - metadata={"help": "Arguments for generating output; optional and can be None."} + metadata={"help": "Arguments for generating output; optional and can be None."}, ) strategy_args: StrategyArguments = field( default=None, - metadata={"help": "The strategy configuration, encapsulated in a StrategyArguments object."} + metadata={"help": "The strategy configuration, encapsulated in a StrategyArguments object."}, ) - world_size: int = field( - default=None, - metadata={"help": "The number of role clusters."}) + world_size: int = field(default=None, metadata={"help": "The number of role clusters."}) device_mapping: Union[List[int], str] = field( default=None, metadata={ @@ -70,49 +111,33 @@ class WorkerConfig: "If device_mapping is None, the worker uses cpu only." }, ) - num_gpus_per_worker: int = field( - default=1, - metadata={"help": "The number of gpu per worker."} - ) - model_update_frequency: int = field( - default=1, - metadata={"help": "Frequency of model updates."} - ) - model_update_method: Literal["nccl", "rpc"] = field( - default="nccl", - metadata={ - "help": "The method of model updates. Options: 'nccl', 'rpc', rpc only for RTP recently." - }, - ) - infer_batch_size: int = field( - default=16, - metadata={"help": "Batch size for inference."} - ) + num_gpus_per_worker: int = field(default=1, metadata={"help": "The number of gpu per worker."}) + model_update_frequency: int = field(default=1, metadata={"help": "Frequency of model updates."}) + infer_batch_size: int = field(default=16, metadata={"help": "Batch size for inference."}) backend_timeout: int = field( default=30, - metadata={"help": "minutes for dist backend communicating."} + metadata={"help": "minutes for dist backend communicating."}, ) system_envs: dict = field( default_factory=dict, - metadata={"help": "system environment variables for this worker."} + metadata={"help": "system environment variables for this worker."}, ) topr_positive_weight: float = field( default=1.0, - metadata={"help": "Weight for positive samples in TOPR loss."} + metadata={"help": "Weight for positive samples in TOPR loss."}, ) topr_negative_weight: float = field( default=1.0, - metadata={"help": "Weight for negative samples in TOPR loss."} - ) - use_remove_padding: bool = field( - default=False, - metadata={"help": "Remove tail padding token in a micro batch, don't pack sequences(different from verl). 
must set `variable_seq_lengths` for megatron."} + metadata={"help": "Weight for negative samples in TOPR loss."}, ) + max_concurrency: int = field(default=1, metadata={"help": "max_concurrency of this Ray Actor"}) use_dynamic_batching_in_train: bool = field( default=False, - metadata={"help": "Dynamic batching is a feature designed to group sequences of similar lengths into batches, " - "minimizing padding and improving computational and memory efficiency."} + metadata={ + "help": "Dynamic batching is a feature designed to group sequences of similar lengths into batches, " + "minimizing padding and improving computational and memory efficiency." + }, ) max_tokens_per_microbatch_in_train: int = field( default=0, @@ -122,38 +147,77 @@ class WorkerConfig: "This config must be set when using dynamic batching. " "Recommended value: sequence_length × 2 × micro_batch_size." ) - } + }, ) - sequence_length_round_in_train:int = field( + sequence_length_round_in_train: int = field( default=4, - metadata={"help": "The value to round up to when truncating the sequence length." - "Note: This config must be set when using dynamic batching."} + metadata={ + "help": "The value to round up to when truncating the sequence length." + "Note: This config must be set when using dynamic batching." + }, ) use_dynamic_batching_in_infer: bool = field( default=False, - metadata={"help": "Dynamic batching is a feature designed to group sequences of similar lengths into batches, " - "minimizing padding and improving computational and memory efficiency."} + metadata={ + "help": "Dynamic batching is a feature designed to group sequences of similar lengths into batches, " + "minimizing padding and improving computational and memory efficiency." + }, ) - max_tokens_per_microbatch_in_infer:int = field( + max_tokens_per_microbatch_in_infer: int = field( default=None, - metadata={"help": "Set the maximum number of tokens for each micro-batch. " - "Note: This config must be set when using dynamic batching."} + metadata={ + "help": "Set the maximum number of tokens for each micro-batch. " + "Note: This config must be set when using dynamic batching." + }, ) - sequence_length_round_in_infer:int = field( + sequence_length_round_in_infer: int = field( default=4, - metadata={"help": "The value to round up to when truncating the sequence length." - "Note: This config must be set when using dynamic batching."} + metadata={ + "help": "The value to round up to when truncating the sequence length." + "Note: This config must be set when using dynamic batching." + }, ) offload_nccl: bool = field( default=False, - metadata={"help": "Whether offload nccl buffer to save gpu memory."} + metadata={"help": "Whether offload nccl buffer to save gpu memory."}, ) # sequence packing use_sequence_packing: bool = field( default=False, - metadata={"help": "Concatenates multiple sequences into a single “packed” sequence, eliminating most padding. " - "Only supported in the megatron strategy"} + metadata={ + "help": "Concatenates multiple sequences into a single “packed” sequence, eliminating most padding. " + "Only supported in the megatron strategy" + }, + ) + + sequence_packing_args: SequencePackingConfig = field( + default_factory=SequencePackingConfig, + metadata={ + "help": "Sequence packing related arguments." + } + ) + + + logits_in_fp32: bool = field( + default=True, + metadata={ + "help": "Force logits dtype to float32" + } + ) + + apply_loss_scale: bool = field( + default=True, + metadata={ + "help": ( + "Whether to multiply the aggregated loss by the global loss_scale (typically the total number of " + "micro-batches in a global step, i.e., DP×GA) to cancel the backend’s default gradient-mean behavior " + "under Data Parallel + Gradient Accumulation. This restores a sum-over-microbatches semantics so the " + "resulting gradients are equivalent to computing the loss on the full global batch at once with the " + "global denominator (especially important with variable-length inputs/sequence packing). Disable only " + "if you already apply an equivalent scaling elsewhere or your backend does not average across DP/GA." + ) + } ) def __post_init__(self): @@ -193,15 +257,34 @@ def __post_init__(self): elif self.model_args.dtype == "fp16": self.training_args.fp16 = True -def is_colocated(actor_train: WorkerConfig, actor_infer: WorkerConfig): - train_devices = set(actor_train.device_mapping or []) + + +def is_actor_infer_overlapping_with_any_cluster(actor_infer: WorkerConfig, actor_train: WorkerConfig = None, reference: WorkerConfig = None, critic: WorkerConfig = None) -> bool: + """ + Check if actor_infer overlaps with ANY of the provided clusters. + + Args: + actor_infer: The actor_infer WorkerConfig + actor_train: The actor_train WorkerConfig (optional) + reference: The reference WorkerConfig (optional) + critic: The critic WorkerConfig (optional) + + Returns: + True if actor_infer overlaps with any provided cluster, False otherwise + """ infer_devices = set(actor_infer.device_mapping or []) - if train_devices.issuperset(infer_devices): - return True - if train_devices.intersection(infer_devices): - # TODO: raise here - # raise ValueError( - # f"train and infer share some devices, but train not cover infer. 
{train_devices=} {infer_devices=}" - # ) - return False + + clusters = { + 'actor_train': actor_train, + 'reference': reference, + 'critic': critic + } + + for cluster_name, cluster_config in clusters.items(): + if cluster_config is not None: + cluster_devices = set(cluster_config.device_mapping or []) + if infer_devices.intersection(cluster_devices): + return True + return False + diff --git a/roll/datasets/collator.py b/roll/datasets/collator.py index 8eba22ac1..47e3f6bb5 100644 --- a/roll/datasets/collator.py +++ b/roll/datasets/collator.py @@ -119,12 +119,19 @@ class DataCollatorWithPaddingForMM: answer_key: Optional[str] = "ground_truth" image_key: Optional[str] = "image" image_flag_key: Optional[str] = "image_flag" + video_key: Optional[str] = "video" + video_flag_key: Optional[str] = "video_flag" + image_placeholder: Optional[str] = None + image_token: str = "<|vision_start|><|image_pad|><|vision_end|>" + video_placeholder: Optional[str] = None + video_token: str = "<|vision_start|><|video_pad|><|vision_end|>" padding: Union[bool, str, PaddingStrategy] = True max_length: Optional[int] = None pad_to_multiple_of: Optional[int] = None padded_keys: List[str] = field(default_factory=lambda: ["input_ids", "attention_mask", "labels"]) extra_unpadded_keys: List[str] = field(default_factory=lambda: []) return_tensors: str = "pt" + return_infer_inputs: bool = True # whether to include infer engine inputs which differs with train def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: assert self.tokenizer and self.processor @@ -136,14 +143,24 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: # cannot process as batch directly though processor output as batch # since pixel_values would be packed among batch images while DataProto # requires all data fields has same batch size - # if image is None, model_inputs would not inlcude image feature field + # if image is None, model_inputs would not include image feature field + prompt = feature[self.prompt_key] + if not isinstance(prompt, str): + prompt = self.processor.apply_chat_template(prompt, tokenize=False, add_generation_prompt=True) + if self.image_placeholder: + prompt = prompt.replace(self.image_placeholder, self.image_token) + if self.video_placeholder: + prompt = prompt.replace(self.video_placeholder, self.video_token) + # TODO: support video model_inputs: BatchFeature = self.processor( images=feature[self.image_key] if self.image_key and (not self.image_flag_key or feature[self.image_flag_key]) else None, - text=feature[self.prompt_key], + text=prompt, ) - for key in ["prompt", "position_ids", "rope_deltas"]: # remove unnecessary feature + if not isinstance(model_inputs, BatchFeature): + model_inputs = BatchFeature(data=model_inputs) + for key in ["prompt", "position_ids", "rope_deltas"]: # remove unnecessary feature if key in model_inputs: model_inputs.pop(key) for key in filter(lambda k: k in model_inputs, self.padded_keys): @@ -159,22 +176,23 @@ def __call__(self, features: List[Dict[str, Any]]) -> Dict[str, Any]: # concat at dim=0 before model forward un_padded_features["multi_modal_inputs"].append(dict(model_inputs)) # inputs for infer engine, not tensors - un_padded_features["multi_modal_data"].append( - { - "prompt_token_ids": # different with input_ids - self.tokenizer.encode(feature[self.prompt_key], add_special_tokens=False), - "multi_modal_data": { - "image": [feature[self.image_key]] - if not isinstance(feature[self.image_key], list) - else feature[self.image_key] - }, - } - if (not 
self.image_flag_key or feature[self.image_flag_key]) and feature[self.image_key] - else { - "prompt_token_ids": # different with input_ids - self.tokenizer.encode(feature[self.prompt_key], add_special_tokens=False), - } - ) + if self.return_infer_inputs: + un_padded_features["multi_modal_data"].append( + { + "prompt_token_ids": # different with input_ids + self.tokenizer.encode(prompt, add_special_tokens=False), + "multi_modal_data": { + "image": [feature[self.image_key]] + if not isinstance(feature[self.image_key], list) + else feature[self.image_key] + }, + } + if (not self.image_flag_key or feature[self.image_flag_key]) and feature[self.image_key] + else { + "prompt_token_ids": # different with input_ids + self.tokenizer.encode(prompt, add_special_tokens=False), + } + ) if self.answer_key: un_padded_features[self.answer_key].append(feature[self.answer_key]) if self.extra_unpadded_keys: diff --git a/roll/datasets/dataset.py b/roll/datasets/dataset.py index 9da7a8ecd..32097af0b 100644 --- a/roll/datasets/dataset.py +++ b/roll/datasets/dataset.py @@ -1,5 +1,5 @@ import os -from typing import Callable, Dict, Union, List +from typing import Callable, Union from datasets import Dataset, IterableDataset, load_dataset @@ -9,11 +9,19 @@ logger = get_logger() -REGISTERED_DATASETS: Dict[str, Callable[[List[str], str, dict], Union[Dataset, IterableDataset]]] = {} +FILEEXT2TYPE = { + "arrow": "arrow", + "csv": "csv", + "json": "json", + "jsonl": "json", + "parquet": "parquet", + "txt": "text", +} +REGISTERED_DATASETS: dict[str, Callable[[list[str], str, dict], Union[Dataset, IterableDataset]]] = {} def register_dataset(key: str): - def decorator(func: Callable[[List[str], str, dict], Union[Dataset, IterableDataset]]): + def decorator(func: Callable[[list[str], str, dict], Union[Dataset, IterableDataset]]): if key in REGISTERED_DATASETS: raise ValueError(f"Dataset type '{key}' already exists!") REGISTERED_DATASETS[key] = func @@ -29,20 +37,14 @@ def get_dataset(data_args: "DataArguments"): data_files = [] dataset_dir = getattr(data_args, "dataset_dir", ".") dataset_type = getattr(data_args, "dataset_type", "default") - FILEEXT2TYPE = { - "arrow": "arrow", - "csv": "csv", - "json": "json", - "jsonl": "json", - "parquet": "parquet", - "txt": "text", - } if isinstance(data_name, list): local_path = "" else: local_path: str = os.path.join(dataset_dir, data_name) - - if os.path.isdir(local_path): + if dataset_type in ("odps",): + data_path = dataset_type + data_files.extend(data_name) + elif os.path.isdir(local_path): for file_name in os.listdir(local_path): data_files.append(os.path.join(local_path, file_name)) if data_path is None: @@ -69,21 +71,13 @@ def get_dataset(data_args: "DataArguments"): logger.info(f"load_data_files: {chr(10)} {chr(10).join(data_files)}") logger.info(f"prompt column: {data_args.prompt} label column: {data_args.response}") - return REGISTERED_DATASETS[data_path](data_files, split='train') + return REGISTERED_DATASETS[data_path](data_files, split="train") -def create_local_dataset(dataset_name: Union[List[str], str], - split: str = "train", - dataset_kwargs: Dict = None) -> Union[Dataset, IterableDataset]: +def create_local_dataset( + dataset_name: Union[list[str], str], split: str = "train", dataset_kwargs: dict = None +) -> Union[Dataset, IterableDataset]: data_files = [] - FILEEXT2TYPE = { - "arrow": "arrow", - "csv": "csv", - "json": "json", - "jsonl": "json", - "parquet": "parquet", - "txt": "text", - } data_path = None logger.info(f"load dataset: {dataset_name}") @@ -93,7 
+87,7 @@ def create_local_dataset(dataset_name: Union[List[str], str], if data_path is None: data_path = FILEEXT2TYPE.get(file_name.split(".")[-1], None) elif data_path != FILEEXT2TYPE.get(file_name.split(".")[-1], None): - raise ValueError("File types should be identical.") + raise ValueError(f"File types should be identical. {data_path=} NOT {file_name=}") logger.info(f"load dataset files: {data_files}") elif os.path.isfile(dataset_name): # is file data_files.append(dataset_name) @@ -105,6 +99,13 @@ def create_local_dataset(dataset_name: Union[List[str], str], data_path = FILEEXT2TYPE.get(file_name.split(".")[-1], None) elif data_path != FILEEXT2TYPE.get(file_name.split(".")[-1], None): raise ValueError("File types should be identical.") + elif isinstance(dataset_name, str) and dataset_name.startswith("odps://"): + # TODO: How to separate ODPS and AILake. + data_path = "odps" + data_files.append(dataset_name) + elif isinstance(dataset_name, str) and dataset_name.startswith("ailake://"): + data_path = "ailake" + data_files.append(dataset_name) else: dataset = load_dataset(dataset_name) logger.info(f"Loaded: {dataset=}") @@ -122,44 +123,34 @@ @register_dataset("default") @register_dataset("json") def default_json_dataset( - data_files: "DataPaths", - split: str = "train", - **kwargs + data_files: "DataPaths", split: str = "train", **kwargs ) -> Union["Dataset", "IterableDataset"]: return load_dataset("json", data_files=data_files, **kwargs)[split] @register_dataset("arrow") def default_arrow_dataset( - data_files: "DataPaths", - split: str = "train", - **kwargs + data_files: "DataPaths", split: str = "train", **kwargs ) -> Union["Dataset", "IterableDataset"]: return load_dataset("arrow", data_files=data_files, **kwargs)[split] @register_dataset("csv") def default_csv_dataset( - data_files: "DataPaths", - split: str = "train", - **kwargs + data_files: "DataPaths", split: str = "train", **kwargs ) -> Union["Dataset", "IterableDataset"]: return load_dataset("csv", data_files=data_files, **kwargs)[split] @register_dataset("parquet") def default_parquet_dataset( - data_files: "DataPaths", - split: str = "train", - **kwargs + data_files: "DataPaths", split: str = "train", **kwargs ) -> Union["Dataset", "IterableDataset"]: return load_dataset("parquet", data_files=data_files, **kwargs)[split] @register_dataset("text") def default_text_dataset( - data_files: "DataPaths", - split: str = "train", - **kwargs + data_files: "DataPaths", split: str = "train", **kwargs ) -> Union["Dataset", "IterableDataset"]: return load_dataset("text", data_files=data_files, **kwargs)[split] diff --git a/roll/datasets/global_dataset.py b/roll/datasets/global_dataset.py index 8e1338d0f..26ec3a1c6 100644 --- a/roll/datasets/global_dataset.py +++ b/roll/datasets/global_dataset.py @@ -1,3 +1,4 @@ +import asyncio import random from typing import Dict, Optional, Any, Callable @@ -44,6 +45,7 @@ async def get_data_item(self, seed: int, **kwargs) -> Dict: if seed is not None: self.idx = random.randint(0, len(self.dataset) - 1) else: + self.idx += 1 if self.idx == len(self.dataset): self.epoch += 1 self.dataset = self.dataset.shuffle(seed=self.epoch) @@ -79,4 +81,6 @@ async def reset(self): refs = [] for dataset_name, dataset_ref in self.global_dataset_dict.items(): refs.append(dataset_ref.reset.remote()) - ray.get(refs) + if refs: + # await all resets concurrently instead of blocking on ray.get + await asyncio.gather(*refs) diff --git a/roll/distributed/executor/cluster.py b/roll/distributed/executor/cluster.py index 
7267819b4..15920ee8f 100644 --- a/roll/distributed/executor/cluster.py +++ b/roll/distributed/executor/cluster.py @@ -2,6 +2,8 @@ from typing import List, Type, Dict, Union, Any import ray +import ray._private.ray_constants as ray_constants +from ray._private.async_compat import has_async_methods from ray._private.worker import RemoteFunctionNoArgs from ray.runtime_env import RuntimeEnv from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy @@ -23,6 +25,7 @@ from roll.utils.import_utils import safe_import_class from roll.utils.logging import get_logger + logger = get_logger() @@ -46,6 +49,7 @@ def __init__( else: self.worker_cls = worker_cls self.resource_manager = resource_manager + self.placement_groups = None self.worker_config = worker_config self.workers: List[Any] = [] @@ -80,6 +84,17 @@ def tp_size(self): def pp_size(self): return self.worker_rank_info[0].pp_size + @property + def cp_size(self): + return self.worker_rank_info[0].cp_size + + @property + def vp_size(self): + if 'virtual_pipeline_model_parallel_size' in self.worker_config.strategy_args.strategy_config: + return self.worker_config.strategy_args.strategy_config['virtual_pipeline_model_parallel_size'] + else: + return 1 + @property def worker_rank_info(self) -> List[RankInfo]: if not self._worker_rank_info or not self.initialized: @@ -96,11 +111,20 @@ def _create_workers(self): device_mapping=self.worker_config.device_mapping, world_size=self.worker_config.world_size ) logger.debug(f"placement_groups: {placement_groups}") + self.placement_groups = placement_groups for rank, pgs in enumerate(placement_groups): deploy_pg = pgs[0] pg_zero_gpu_ranks = sorted([pg["gpu_rank"] for pg in pgs if pg["node_rank"] == deploy_pg["node_rank"]]) - worker_name = f"{self.cluster_name}-{rank}" + + # Include GPU IDs in worker name for timeline visualization + # Format: actor_train-0-G0 (single GPU) or actor_infer-0-G01 (TP=2) + if pg_zero_gpu_ranks and deploy_pg["gpu_rank"] is not None: + gpu_str = "".join(str(g) for g in pg_zero_gpu_ranks) + worker_name = f"{self.cluster_name}-{rank}-G{gpu_str}" + else: + # CPU-only workers + worker_name = f"{self.cluster_name}-{rank}" env_vars = { "WORLD_SIZE": str(self.world_size), "RANK": str(rank), @@ -121,12 +145,21 @@ def _create_workers(self): runtime_env = RuntimeEnv(env_vars=env_vars) self.worker_config.resource_placement_groups = pgs + if has_async_methods(self.worker_cls.__ray_metadata__.modified_class): + max_concurrency = (self.worker_config.max_concurrency if self.worker_config.max_concurrency > 1 + else ray_constants.DEFAULT_MAX_CONCURRENCY_ASYNC) + logger.info(f"set max_concurrency to {max_concurrency} for worker {type(self.worker_cls)}") + else: + assert self.worker_config.max_concurrency == 1 + max_concurrency = 1 + worker_options = { "scheduling_strategy": PlacementGroupSchedulingStrategy(placement_group=deploy_pg["placement_group"]), "name": worker_name, "namespace": RAY_NAMESPACE, "runtime_env": runtime_env, "num_cpus": 0.01, + "max_concurrency": max_concurrency, } if current_platform.ray_device_key == "GPU": diff --git a/roll/distributed/executor/model_update_group.py b/roll/distributed/executor/model_update_group.py index 46e019fe1..3ea8effd4 100644 --- a/roll/distributed/executor/model_update_group.py +++ b/roll/distributed/executor/model_update_group.py @@ -1,158 +1,41 @@ -import itertools -import json -import time -from collections import defaultdict -from typing import List, Any - import ray +from roll.configs.base_config import PPOConfig from 
roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.protocol import DataProto -from roll.utils.functionals import reduce_metrics +from roll.utils.functionals import reduce_metrics_list class ModelUpdateGroup: - - def __init__(self, src_cluster: Cluster, tgt_cluster: Cluster, frequency: int = 1): - self.src_cluster: Any = src_cluster - self.tgt_cluster: Any = tgt_cluster + def __init__(self, src_cluster: Cluster, tgt_cluster: Cluster, pipeline_config: PPOConfig, frequency=1): + self.src_cluster = src_cluster + self.tgt_cluster = tgt_cluster self.frequency = frequency + self.pipeline_config = pipeline_config self.model_update_name = f"model_update/{self.src_cluster.cluster_name}_2_{self.tgt_cluster.cluster_name}" + train_devices = set(src_cluster.worker_config.device_mapping or []) + infer_devices = set(tgt_cluster.worker_config.device_mapping or []) - # 存src actor -> tgt actors的映射 (src_actor, tgt_actors) - # 相同pp_rank的comm_plan是可以并发执行的,全部并发执行需要探索一下 - # Dict[pp_rank, Dict[src_actor_rank, List[tgt_actor_rank]]] - self.broadcast_comm_pan = defaultdict(lambda: defaultdict(list)) - - # 用于相同gpu的actor发送 - self.p2p_comm_plan = defaultdict(lambda: defaultdict(list)) - - self.make_comm_plan() - self.make_collective_group() - - def make_comm_plan(self): - """ - comm_plan demo: - { - "0": - { - "0": [ - {"rank": 0, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 1}}, - {"rank": 1, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 2}}, - {"rank": 2, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 4}}, - {"rank": 3, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 6}}], - "1": [ - {"rank": 0, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 0}}, - {"rank": 1, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 3}}, - {"rank": 2, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 5}}, - {"rank": 3, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 7}}] - }, - "1": { - "2": [ - {"rank": 0, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 0}}, - {"rank": 1, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 3}}, - {"rank": 2, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 4}}, - {"rank": 3, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 6}}], - "3": [ - {"rank": 0, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 1}}, - {"rank": 1, "device": {"rank": 0, "node_rank": 0, "gpu_rank": 2}}, - {"rank": 2, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 5}}, - {"rank": 3, "device": {"rank": 1, "node_rank": 0, "gpu_rank": 7}} - ] - } - } - """ - src_pp_ranks: List[int] = [rank_info.pp_rank for rank_info in self.src_cluster.worker_rank_info] - group_by_pp_rank = defaultdict(list) - for i, pp_rank in enumerate(src_pp_ranks): - group_by_pp_rank[pp_rank].append(i) + assert (max(train_devices) - min(train_devices)) == (len(train_devices) - 1), f"{train_devices=} must be continuous" + assert (max(infer_devices) - min(infer_devices)) == (len(infer_devices) - 1), f"{infer_devices=} must be continuous" - tgt_devices = [] - for rank in range(self.tgt_cluster.world_size): - for device in self.tgt_cluster.rank2devices[rank]: - tgt_devices.append(dict(rank=rank, device=device)) - - for pp_rank, src_ranks in group_by_pp_rank.items(): - for src_rank in src_ranks: - self.broadcast_comm_pan[pp_rank][src_rank] = [] - src_rank_iter = itertools.cycle(src_ranks) - i = 0 - while i < len(tgt_devices): - tgt_device = tgt_devices[i] - src_rank = next(src_rank_iter) - # 如何src_rank和tgt_rank位于同一个设备上,再取一个,如果两个相同,则无法分配当前tgt,加入p2p - src_device = self.src_cluster.rank2devices[src_rank][0] - if 
(src_device["node_rank"], src_device["gpu_rank"]) == ( - tgt_device["device"]["node_rank"], - tgt_device["device"]["gpu_rank"], - ): - src_rank_next = next(src_rank_iter) - if src_rank_next == src_rank: - self.p2p_comm_plan[pp_rank][src_rank].append(tgt_device) - else: - i += 1 - self.broadcast_comm_pan[pp_rank][src_rank_next].append(tgt_device) - if i >= len(tgt_devices): - break - tgt_device_next = tgt_devices[i] - self.broadcast_comm_pan[pp_rank][src_rank].append(tgt_device_next) - else: - self.broadcast_comm_pan[pp_rank][src_rank].append(tgt_device) - i += 1 - print(f"broadcast_comm_pan: {json.dumps(self.broadcast_comm_pan)}") - print(f"p2p_comm_plan: {json.dumps(self.p2p_comm_plan)}") - if len(self.p2p_comm_plan) > 0: - print("p2p comm does not suggest, please change your config") - - def model_update_group_name(self, src_rank, tgt_devices): - tgt_names = [f"({tgt_device['rank']},{tgt_device['device']['rank']})" for tgt_device in tgt_devices] - return f"model_update_{self.src_cluster.cluster_name}_{src_rank}_to_{self.tgt_cluster.cluster_name}_{'-'.join(tgt_names)}" - - def make_collective_group(self): - for pp_rank, pp_comm_plan in self.broadcast_comm_pan.items(): - refs = [] - pp_comm_plan_args = {} - for src_rank, tgt_devices in pp_comm_plan.items(): - comm_plan_args = {} - group_name = self.model_update_group_name(src_rank, tgt_devices) - group_master_worker = self.src_cluster.rank2worker[src_rank] - group_master_addr = ray.get(group_master_worker.get_node_ip.remote()) - group_master_port = ray.get(group_master_worker.get_free_port.remote()) - comm_plan_args["group_name"] = group_name - comm_plan_args["master_addr"] = group_master_addr - comm_plan_args["master_port"] = group_master_port - comm_plan_args["tgt_devices"] = tgt_devices - comm_plan_args["src_pp_rank"] = pp_rank - comm_plan_args["src_rank"] = src_rank - pp_comm_plan_args[src_rank] = comm_plan_args - ref = group_master_worker.setup_collective_group.remote(model_update_name=self.model_update_name, - comm_plan={src_rank: comm_plan_args}) - refs.append(ref) - - print(f"pp_rank: {pp_rank} pp_comm_plan_args: {json.dumps(pp_comm_plan_args)}") - for tgt_worker in self.tgt_cluster.workers: - ref = tgt_worker.setup_collective_group.remote(model_update_name=self.model_update_name, - comm_plan=pp_comm_plan_args) - refs.append(ref) - ray.get(refs) + ray.get( + [ + train_worker.setup_model_update.remote( + infer_cluster=self.tgt_cluster, model_update_name=self.model_update_name + ) + for train_worker in self.src_cluster.workers + ] + ) def model_update(self, step=None): - metrics_list = {} - if step % self.frequency == 0: - for pp_rank, pp_comm_plan in self.broadcast_comm_pan.items(): - # 一个pp rank 内的要一起更新,目标是更新这一pp rank(pp stage part)内的参数 - # 具体model_update由src role自行实现,这样不需要显式更新模型参数 - refs = [] - for src_rank, tgt_devices in pp_comm_plan.items(): - src_cluster = self.src_cluster.rank2worker[src_rank] - ref = src_cluster.start_model_update.remote( - model_update_name=self.model_update_name, - tgt_workers=self.tgt_cluster.workers, - broadcast_tgt_devices=tgt_devices, - p2p_tgt_devices=self.p2p_comm_plan[pp_rank][src_rank], - ) - refs.append(ref) - data = ray.get(refs) - - metrics_list.update(reduce_metrics(DataProto.concat(data).meta_info.pop("metrics", {}))) - return metrics_list + if step % self.frequency != 0: + return {} + + dataprotos: list[DataProto] = ray.get( + [ + train_worker.start_model_update.remote(model_update_name=self.model_update_name) + for train_worker in self.src_cluster.workers + ] + ) + return 
reduce_metrics_list([dataproto.meta_info["metrics"] for dataproto in dataprotos]) diff --git a/roll/distributed/executor/worker.py b/roll/distributed/executor/worker.py index ce162a2dd..fd16a7bf7 100644 --- a/roll/distributed/executor/worker.py +++ b/roll/distributed/executor/worker.py @@ -18,6 +18,7 @@ from roll.utils.network_utils import collect_free_port, get_node_ip from roll.utils.offload_states import OffloadStateType from roll.utils.offload_nccl import monkey_patch_torch_dist + from roll.platforms import current_platform @@ -115,6 +116,29 @@ def get_free_port(): def get_master_addr_and_port(self): return self.master_addr, self.master_port + def _get_strategy_load_state(self) -> Optional[bool]: + """Check if strategy model is loaded in GPU. + + Handles multiple strategy implementations: + - vLLM: strategy.is_model_in_gpu (direct attribute) + - SGLang: strategy.model.is_model_in_gpu (nested attribute) + - Others: None (not trackable) + + Returns: + True if loaded, False if offloaded, None if not trackable + """ + if getattr(self, "strategy", None) is None: + return None + + # Try direct attribute (vLLM pattern) + is_loaded = getattr(self.strategy, 'is_model_in_gpu', None) + + # Try nested attribute (SGLang pattern) + if is_loaded is None and hasattr(self.strategy, 'model'): + is_loaded = getattr(self.strategy.model, 'is_model_in_gpu', None) + + return is_loaded + @staticmethod def get_visible_gpus(): return current_platform.get_visible_gpus() @@ -145,6 +169,12 @@ def load_states(self, *args, **kwargs): self.strategy.load_states() else: self.logger.warning("worker has not strategy") + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + def process_weights_after_loading(self): + if getattr(self, "strategy", None) is not None: + self.strategy.process_weights_after_loading() + @register(dispatch_mode=Dispatch.ONE_TO_ALL) def offload_states(self, *args, **kwargs): @@ -159,11 +189,8 @@ def broadcast_parameter(self, *args, **kwargs): else: self.logger.warning("worker has not strategy") - def broadcast_bucket(self, *args, **kwargs): - if getattr(self, "strategy", None) is not None: - self.strategy.broadcast_bucket(*args, **kwargs) - else: - self.logger.warning("worker has not strategy") + def setup_model_update(self, *args, **kwargs): + self.strategy.setup_model_update(*args, **kwargs) def setup_collective_group(self, *args, **kwargs): if getattr(self, "strategy", None) is not None: @@ -171,6 +198,12 @@ def setup_collective_group(self, *args, **kwargs): else: self.logger.warning("worker has not strategy") + def setup_p2p_collective_group(self, *args, **kwargs): + if getattr(self, "strategy", None) is not None: + self.strategy.setup_p2p_collective_group(*args, **kwargs) + else: + self.logger.warning("worker does not have a strategy") + def start_model_update(self, *args, **kwargs): metrics = {} if getattr(self, "strategy", None) is not None: @@ -189,9 +222,9 @@ def start_model_update(self, *args, **kwargs): output = DataProto(meta_info={"metrics": metrics}) return output - def update_parameter(self, *args, **kwargs): + def model_update_set_read_done_handle(self, *args, **kwargs): if getattr(self, "strategy", None) is not None: - self.strategy.update_parameter(*args, **kwargs) + self.strategy.model_update_set_read_done_handle(*args, **kwargs) else: self.logger.warning("worker has not strategy") @@ -207,10 +240,6 @@ def add_lora(self, *args, **kwargs): else: self.logger.warning("worker has not strategy") - def download_models(self, model_name_or_paths: set[str]): - 
futures.wait([self.thread_executor.submit(download_model, model_name_or_path) - for model_name_or_path in model_name_or_paths]) - @register(dispatch_mode=Dispatch.DP_MP_COMPUTE) def get_metrics(self, metric_names: Optional[List[str]] = None) -> DataProto: """ diff --git a/roll/distributed/scheduler/driver_utils.py b/roll/distributed/scheduler/driver_utils.py index c6a7b634a..e9c9e1f6e 100644 --- a/roll/distributed/scheduler/driver_utils.py +++ b/roll/distributed/scheduler/driver_utils.py @@ -121,4 +121,20 @@ async def wait(self): self.arrived = 0 self.event.clear() return - await self.event.wait() \ No newline at end of file + await self.event.wait() + + +@ray.remote +class Locker: + def __init__(self): + self._locked = False + + def acquire(self): + if self._locked: + return False + self._locked = True + return True + + def release(self): + assert self._locked + self._locked = False diff --git a/roll/distributed/scheduler/generate_scheduler.py b/roll/distributed/scheduler/generate_scheduler.py index d968eb5aa..9c17b7984 100644 --- a/roll/distributed/scheduler/generate_scheduler.py +++ b/roll/distributed/scheduler/generate_scheduler.py @@ -1,13 +1,16 @@ +import asyncio import copy import itertools -import queue import random -import threading -import asyncio +import math import uuid import time -from collections import defaultdict +from collections import defaultdict, deque +from dataclasses import dataclass, fields +from itertools import cycle + from typing import Any, Union, Optional, Dict, List, Set +from contextlib import asynccontextmanager import numpy as np import ray @@ -18,88 +21,619 @@ from tqdm import tqdm from transformers import set_seed -import hashlib -import base64 -import json -import os - from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.protocol import DataProto, collate_fn, pad_dataproto_to_divisor, unpad_dataproto from roll.distributed.scheduler.reward_scheduler import RewardScheduler +from roll.distributed.scheduler.rollout_mock_mixin import RolloutMockMixin from roll.models.model_providers import default_tokenizer_provider, default_processor_provider -from roll.utils.constants import RAY_NAMESPACE from roll.utils.functionals import ( postprocess_generate, - reduce_metrics, concatenate_input_and_output, - GenerateRequestType, ) +from roll.utils.taskgroups import TaskGroup # TODO use official TaskGroup after upgrade to python 3.11 +from roll.utils.metrics.metrics_manager import DurationTracker +from roll.utils.import_utils import safe_import_class from roll.utils.logging import get_logger logger = get_logger() -@ray.remote(concurrency_groups={"single_thread": 1, "multi_thread": 128}) -class GenerateScheduler: +def is_report_data_finished(data: DataProto) -> bool: + finish_reasons = data.meta_info.get("finish_reasons", []) + assert isinstance(finish_reasons, list), f"{finish_reasons}" + assert all(isinstance(finish_reason, str) for finish_reason in finish_reasons), f"{finish_reasons}" + return not any(finish_reason == "abort" for finish_reason in finish_reasons) + +def expand_requests(data: DataProto, num_return_sequences, is_num_return_sequences_expand): + """ + Args: + data (DataProto) [IN|OUT]: 'num_return_sequences' will be overwritten + """ + assert "generation_config" in data.meta_info, f"data {data.meta_info} should have key 'generation_config'" + generation_config = data.meta_info["generation_config"] + target_requests = [] + if is_num_return_sequences_expand: + generation_config["num_return_sequences"] = 1 + for _ in 
range(num_return_sequences): + target_requests.append(copy.deepcopy(data)) + else: + generation_config["num_return_sequences"] = num_return_sequences + target_requests.append(copy.deepcopy(data)) + return target_requests + +def expand_responses(response: Optional[Union[DataProto, List[DataProto]]]) -> List[DataProto]: + ret: List[DataProto] = [] + if response is None: + return ret + stack = deque([response]) + while stack: + current = stack.pop() + if isinstance(current, list): + stack.extend(reversed(current)) + else: + assert isinstance(current, DataProto) + batch_size = current.batch.batch_size[0] + assert batch_size >= 0 + if batch_size > 1: + stack.extend(reversed([current[i] for i in range(batch_size)])) + else: + ret.append(current) + return ret + + +class LoadBalancer: + """ + Manage a bunch of workers (worker indexes). Limit the number of running requests of + each dp rank below max_running_requests. + + Usage: see the docstring of ReplayBuffer + + TODO support rescheduling to a different dp_rank after model update, or support dynamic dp_rank. + """ + + class Lease: + def __init__(self, load_balancer: "LoadBalancer", lease: int, dp_rank: int): + self.mutex = asyncio.Lock() + self.load_balancer = load_balancer + self.lease = lease + self.limit = lease + self.running = 0 + self._dp_rank = dp_rank + + def __del__(self): + # The user must call clear or consume the whole lease to give credit back explicitly. + assert self.lease == 0 + + def clear(self): + assert self.lease >= 0 + assert self.running == 0 + self.load_balancer._release(self._dp_rank, credit=self.lease) + self.limit = 0 + self.lease = 0 + + @asynccontextmanager + async def lock(self, samplen): + assert self.running + samplen <= self.limit # user should not over-subscribe lease + self.running += samplen # must add outside the lock + + async with self.mutex: + if self.lease < samplen: + extra = samplen - self.lease + await self.load_balancer._reacquire(self._dp_rank, credit=extra) + self.lease += extra + assert self.lease == samplen + self.lease -= samplen + + try: + # return dp_rank explicitly, may support dynamic dp_rank + yield self._dp_rank + finally: + self.running -= samplen + self.load_balancer._release(self._dp_rank, credit=samplen) + + def __init__(self, mp_rank_zero: Dict[int, Any], max_running_requests: int): + self.workers = {} # key: dp_rank, value: running_requests + self.worker_acquire_event = {} + for dp_rank in mp_rank_zero.keys(): + self.workers[dp_rank] = 0 + self.worker_acquire_event[dp_rank] = asyncio.Event() + + self.max_running_requests = max_running_requests + self.running_request = 0 + self.acquire_event = asyncio.Event() + + self._suspend = False + self.suspend_event = asyncio.Event() + self.empty_event = asyncio.Event() + + async def acquire(self, credit: int) -> Lease: + """ + Dispatch the n samples of a prompt to the same worker using a best-fit strategy + (linear search for simplicity), blocking until a worker is available. + """ + while True: + while self._suspend: + self.suspend_event.clear() + await self.suspend_event.wait() + + target = -1 + for dp_rank, running_requests in self.workers.items(): + if running_requests >= self.max_running_requests: + continue + if target == -1 or running_requests < self.workers[target]: + target = dp_rank + if target != -1: + # FIXME may send more than max_running_requests (i.e. 
workers[target] + credit > max_running_requests) + self.workers[target] += credit + self.running_request += credit + return self.Lease(self, lease=credit, dp_rank=target) + self.acquire_event.clear() + await self.acquire_event.wait() + + async def _reacquire(self, dp_rank: int, credit: int) -> None: + """ + For multi-turn rollout. + """ + assert dp_rank in self.workers + while True: + while self._suspend: + self.suspend_event.clear() + await self.suspend_event.wait() + + if self.workers[dp_rank] < self.max_running_requests: + self.workers[dp_rank] += credit + self.running_request += credit + return + self.worker_acquire_event[dp_rank].clear() + await self.worker_acquire_event[dp_rank].wait() + + def _release(self, dp_rank: int, credit: int = 1): + assert credit >= 0 + self.workers[dp_rank] -= credit + self.running_request -= credit + assert self.workers[dp_rank] >= 0 + assert self.running_request >= 0 + self.acquire_event.set() + self.worker_acquire_event[dp_rank].set() + self.empty_event.set() + + def empty(self) -> bool: + return sum(self.workers.values()) == 0 + + def full(self) -> bool: + return all(running_requests >= self.max_running_requests for running_requests in self.workers.values()) + + def suspend(self): + """ + Suspend all running requests. + + After suspend, callers of acquire and _reacquire block until resume() is called. + """ + if self._suspend: + return + self._suspend = True + + async def wait_complete(self): + """ + Wait until all running requests are finished (no matter + whether suspended or not). + """ + while self.running_request > 0: + self.empty_event.clear() + await self.empty_event.wait() + + def resume(self): + self._suspend = False + self.suspend_event.set() + + +@dataclass +class ExperienceItem: + prompt_id: int + domain: str = "default" + sampling_start_step: Optional[int] = None + data: Optional[DataProto] = None + + +class ItemsGroup: + """ + Items with the same starting step. + """ + def __init__(self, start_step): + self.start_step: int = start_step + self.running_prompts: Set[int] = set() + self.finished_prompts: deque[List[ExperienceItem]] = deque() + self.num_samples = 0 + self.event = asyncio.Event() + + def info(self): + return ( + f"ItemsGroup {self.start_step}: " + f"{len(self.running_prompts)=} " + f"{len(self.finished_prompts)=} " + f"{self.num_samples=} " + ) + + def empty(self) -> bool: + return len(self.running_prompts) == 0 and len(self.finished_prompts) == 0 + + def get_running_prompt_ids(self) -> Set[int]: + return self.running_prompts + + def begin_prompt(self, prompt_id): + assert prompt_id not in self.running_prompts + self.running_prompts.add(prompt_id) + + def commit_prompt(self, prompt_id: int, result: List[ExperienceItem]): + self.running_prompts.remove(prompt_id) + assert prompt_id not in self.finished_prompts + self.finished_prompts.append(result) + self.num_samples += len(result) + self.event.set() + + def abort_prompt(self, prompt_id: int): + self.running_prompts.remove(prompt_id) + assert prompt_id not in self.finished_prompts + self.event.set() + + async def get_batch(self, expected_samples) -> List[List[ExperienceItem]]: + """ + Get at most batch_size * num_return_sequences samples from finished prompts, + blocking while prompts are still running and not enough responses have been collected. 
+        """
+        assert expected_samples >= 0
+        while self.num_samples < expected_samples and not len(self.running_prompts) == 0:
+            self.event.clear()
+            await self.event.wait()
+        if self.num_samples <= expected_samples:
+            result = list(self.finished_prompts)
+            collected_samples = self.num_samples
+            self.finished_prompts = deque()
+            self.num_samples = 0
+        else:
+            result = []
+            collected_samples = 0
+            while collected_samples < expected_samples and self.finished_prompts:
+                item = self.finished_prompts.popleft()
+                result.append(item)
+                collected_samples += len(item)
+                self.num_samples -= len(item)
+            assert sum(len(item) for item in self.finished_prompts) == self.num_samples
+            assert collected_samples == sum(len(item) for item in result)
+        # collected_samples may be greater than expected_samples
+        return result
+
+
+class ReplayBuffer:
+    """
+    Provide a transactional interface to control running and finished prompts.
+
+    Both sync and async training are supported (sync training is a special case of async training).
+
+    Limits the number of running prompts (not aware of num_return_sequences) to at most batch_size,
+    or batch_size + max_additional_running_prompts.
+
+    Often used with LoadBalancer: ReplayBuffer controls how many prompts can be
+    in flight at the same time, and LoadBalancer limits requests to ActorInfer and RewardWorker.
+    The real concurrency is limited by both ReplayBuffer and LoadBalancer.
+
+    Public interface:
+    * advance_step: update the current step and increase the total batch size (think of
+      this as epoch-based reclamation, i.e. epoch-based garbage collection)
+    * poll: poll for a prompt_id with an implicit rate limit
+    * begin: bind a prompt_id to the current step
+    * commit/abort: accept or filter out the responses of a prompt
+    * gc: garbage collect outdated running or committed (finished) prompts
+      (sync training will clear all stored but unused prompts)
+
+    Usage: see tests/distributed/scheduler/test_generate_scheduler.py and RolloutContext.
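+
+    Illustrative sketch of the synchronous call sequence (editor's example; `responses`
+    and `progress_bar` are placeholders, not part of this patch):
+
+        rb = ReplayBuffer(async_generation_ratio=0,
+                          is_use_additional_prompts=False,
+                          max_additional_running_prompts=0)
+        rb.advance_step(step=0, batch_size=1)
+        pid = await rb.poll()
+        start_step = await rb.begin(pid)
+        rb.commit(pid, [ExperienceItem(prompt_id=pid, sampling_start_step=start_step, data=resp)
+                        for resp in responses])
+        items = await rb.get_batch(expected_samples=len(responses), progress_bar=progress_bar)
+        rb.gc()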
+    """
+    def __init__(
+        self,
+        async_generation_ratio,
+        is_use_additional_prompts,
+        max_additional_running_prompts
+    ):
+        self.pid = 0
+        self.current_step = None
+        self.groups: Dict[int, ItemsGroup] = {}
+        self.prompt_id_to_start_step: Dict[int, int] = {}  # only stores mappings for running prompts
+
+        self.batch_size = 0
+        assert async_generation_ratio >= 0
+        assert not is_use_additional_prompts or max_additional_running_prompts > 0
+        self.async_generation_ratio = async_generation_ratio
+        self.is_use_additional_prompts = is_use_additional_prompts
+        self.max_additional_running_prompts = max_additional_running_prompts
+
+        self._shutdown = False
+
+        self.running_prompts = 0  # prompts in running state
+        self.running_extra_prompts = 0  # additional prompts in running state
+        # Only running_prompts (not running_extra_prompts) is moved to completed_prompts at commit,
+        # so running_prompts + completed_prompts may be less than used_prompts if is_use_additional_prompts is set.
+        self.completed_prompts = 0  # prompts in commit state
+        self.event = asyncio.Event()
+        self.advance_step_event = asyncio.Event()
+
+    def info(self) -> str:
+        group_info = [group.info() for group in self.groups.values()]
+        return (
+            f"ReplayBuffer: {self.current_step=} {self.batch_size=} "
+            f"{self.used_prompts=} {self.completed_prompts=} {self.running_prompts=} {self.running_extra_prompts=} "
+            f"{group_info=} "
+        )
+
+    def _next_pid(self):
+        pid = self.pid
+        self.pid += 1
+        return pid
+
+    @property
+    def used_prompts(self) -> int:
+        return self.pid
+
+    def shutdown(self):
+        self._shutdown = True
+        self.event.set()
+        self.advance_step_event.set()
+
+    def advance_step(self, step, batch_size):
+        # step must increase monotonically
+        assert not self.current_step or step > self.current_step
+        assert step not in self.groups
+        old_step = self.current_step
+        old_batch_size = self.batch_size
+        self.current_step = step
+        # Must create ItemsGroup before get_batch. Otherwise,
+        # if the user calls get_batch with min_step = current_step before
+        # any process_new_prompt task is scheduled, get_batch will skip
+        # the current step and return an empty list, which breaks the postcondition
+        # of get_enough_finished_prompts.
+        self.groups[step] = ItemsGroup(start_step=step)
+        if self.batch_size == 0 and self.async_generation_ratio > 0:
+            # first step
+            self.batch_size = math.ceil(batch_size * self.async_generation_ratio)
+        self.batch_size += batch_size
+        logger.info(f"advance_step from {old_step} to {self.current_step}, "
+                    f"batch_size from {old_batch_size} to {self.batch_size}")
+        self.event.set()
+        self.advance_step_event.set()
+
+    def _check_send_new_request(self) -> bool:
+        if self.running_prompts + self.completed_prompts < self.batch_size:
+            self.running_prompts += 1
+            return True
+        elif self.is_use_additional_prompts and self.batch_size > 0 and (self.running_prompts + self.running_extra_prompts < self.max_additional_running_prompts):
+            # the condition self.batch_size > 0 ensures we are not at the initialization stage
+            self.running_extra_prompts += 1
+            return True
+        else:
+            return False
+
+    async def poll(self) -> int:
+        """
+        Blocks waiting while no new request can be sent and the buffer is not in the shutdown stage.
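+
+        Admission rule (mirrors _check_send_new_request): a new prompt_id is handed out
+        while running_prompts + completed_prompts < batch_size; with is_use_additional_prompts,
+        extra prompts are admitted while running_prompts + running_extra_prompts <
+        max_additional_running_prompts.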
+        """
+        prompt_id = self._next_pid()
+        while True:
+            if self._shutdown:
+                raise asyncio.CancelledError
+            elif self._check_send_new_request():
+                self.prompt_id_to_start_step[prompt_id] = None
+                return prompt_id
+            self.event.clear()
+            await self.event.wait()
+
+    async def begin(self, prompt_id) -> int:
+        """
+        Blocks until the group for current_step exists (it may be missing during garbage collection).
+
+        Raises:
+            asyncio.CancelledError: if prompt_id is aborted or the system is in the shutdown stage.
+        """
+        assert self.current_step is not None
+        while True:
+            start_step = self.current_step
+            if start_step in self.groups:
+                assert start_step == next(reversed(self.groups))
+                break
+            elif self._shutdown:
+                raise asyncio.CancelledError
+            self.advance_step_event.clear()
+            await self.advance_step_event.wait()
+        if prompt_id not in self.prompt_id_to_start_step:
+            raise asyncio.CancelledError
+        assert self.prompt_id_to_start_step[prompt_id] is None, f"{prompt_id=} {self.prompt_id_to_start_step[prompt_id]=}"
+        self.prompt_id_to_start_step[prompt_id] = start_step
+        group = self.groups[start_step]
+        group.begin_prompt(prompt_id)
+        return start_step
+
+    def _commit_prompt(self):
+        assert self.running_prompts + self.running_extra_prompts > 0
+        if self.running_prompts > 0:
+            self.running_prompts -= 1
+            self.completed_prompts += 1
+        else:
+            self.running_extra_prompts -= 1
+
+    def commit(self, prompt_id, result: List[ExperienceItem]):
+        try:
+            if prompt_id not in self.prompt_id_to_start_step:
+                # Prompt has finished or has been garbage collected.
+                raise asyncio.CancelledError
+            start_step = self.prompt_id_to_start_step.pop(prompt_id)
+            if start_step is None:
+                raise RuntimeError("Prompt has not been bound to a step, cannot commit.")
+            assert start_step in self.groups, "group must exist between begin and commit/abort"
+            # assert len(result) == num_return_sequences
+            assert all(item.sampling_start_step == start_step for item in result)
+            self._commit_prompt()
+            self.groups[start_step].commit_prompt(prompt_id, result)
+        finally:
+            self.event.set()
+
+    def _abort_prompt(self):
+        assert self.running_prompts + self.running_extra_prompts > 0
+        if self.running_prompts > 0:
+            self.running_prompts -= 1
+        else:
+            self.running_extra_prompts -= 1
+
+    def abort(self, prompt_id):
+        try:
+            if prompt_id not in self.prompt_id_to_start_step:
+                # Prompt has finished or has been garbage collected.
+                return
+            start_step = self.prompt_id_to_start_step.pop(prompt_id)
+            if start_step is None:
+                # Prompt has not been bound to a step.
+                self._abort_prompt()
+                return
+            assert start_step in self.groups, "group must exist between begin and commit/abort"
+            self._abort_prompt()
+            self.groups[start_step].abort_prompt(prompt_id)
+        finally:
+            self.event.set()
+
+    async def get_batch(self, expected_samples: int, progress_bar) -> List[ExperienceItem]:
+        """
+        Await this function to wait until enough responses have been collected.
+
+        Assumes self.groups will not be mutated during this function, i.e. min(self.groups.keys()) and
+        max(self.groups.keys()) will not change and there is no iterator invalidation.
+        """
+        min_step = self.current_step - math.ceil(self.async_generation_ratio)
+        # The min_step group must exist, or min_step must be less than the minimum step of self.groups.
+        assert min_step in self.groups or next(iter(self.groups)) > min_step
+
+        logger.info(f"ReplayBuffer get_batch: {self.current_step=} {min_step=} {expected_samples=}, {self.info()}")
+
+        collected_samples = 0
+        responses: List[List[DataProto]] = []
+
+        # Iterating over self.groups goes from small step to large step.
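+        # Worked example (illustrative): with async_generation_ratio=1.5 and current_step=5,
+        # min_step = 5 - ceil(1.5) = 3; the step-3 group contributes at most
+        # int(expected_samples * 0.5) samples, steps 4+ contribute the remainder,
+        # and the current step is drained in the while loop after the for loop.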
+        for step, group in self.groups.items():
+            if step < min_step:
+                continue
+            elif step == self.current_step:
+                # Special case: the currently running step; the scheduler may not have sent
+                # new prompts yet (because of concurrency), so we handle it outside this for loop.
+                break
+            elif step == min_step:
+                if self.async_generation_ratio % 1 == 0:
+                    expected = expected_samples - collected_samples
+                else:
+                    expected = int(expected_samples * (self.async_generation_ratio % 1))
+            else:
+                expected = expected_samples - collected_samples
+            finished_prompts = await group.get_batch(expected_samples=expected)
+            amount = sum(len(response) for response in finished_prompts)
+            collected_samples += amount
+            progress_bar.update(amount)
+            responses.extend(finished_prompts)
+            if collected_samples >= expected_samples:
+                break
+        # step == self.current_step: wait for the scheduler to send enough new prompts
+        while collected_samples < expected_samples:
+            # There may be no running prompt at this time;
+            # yield control so process_new_prompt can be scheduled.
+            await asyncio.sleep(0)
+            finished_prompts = await group.get_batch(expected_samples=expected_samples-collected_samples)
+            amount = sum(len(response) for response in finished_prompts)
+            collected_samples += amount
+            # To update progress_bar in a finer-grained manner, get_batch could be called on the latest step
+            # with expected_samples=num_return_sequences, but that increases overhead.
+            progress_bar.update(amount)
+            responses.extend(finished_prompts)
+
+        result: List[ExperienceItem] = []
+        for response in responses:
+            result.extend(response)
+        assert len(result) == collected_samples
+        return result
+
+    def gc(self) -> List[int]:
+        """
+        Garbage collect old ItemsGroup.
+
+        Returns aborted prompt ids to the scheduler (caller) and relies on the
+        scheduler (caller) to abort the corresponding generate requests.
+
+        Assumed to be called after get_batch(step=current_step) and before advance_step(step=current_step + 1);
+        garbage collects steps less than or equal to current_step - ceil(async_generation_ratio).
+
+        The user must handle the return values of begin and commit to check whether to shut down.
+        """
+        # Assume the following operations are atomic (no yield).
+        if self.current_step is None:
+            assert not self.groups
+            return []
+        max_gc_step = self.current_step - math.ceil(self.async_generation_ratio)
+        assert max_gc_step == self.current_step or self.async_generation_ratio > 0
+        assert not self.groups or max_gc_step <= next(reversed(self.groups))
+        aborted_prompts = []
+        # Groups must be removed only after all their prompts are gracefully aborted.
+        aborted_groups = []
+        # Iterating over self.groups goes from small step to large step.
+ for step, group in self.groups.items(): + if step <= max_gc_step: + aborted_prompts.extend(group.get_running_prompt_ids()) + aborted_groups.append(step) + for prompt_id in aborted_prompts: + self.abort(prompt_id) + for step in aborted_groups: + assert self.groups[step].empty() or self.is_use_additional_prompts, f"{step=} {self.groups[step]=}" + del self.groups[step] + logger.info(f"ReplayBuffer {self.current_step=} {max_gc_step=} garbage collect groups {aborted_groups}") + return aborted_prompts + + +class Scheduler: + def __init__(self): + self.request_id = uuid.uuid4() + self.request_counter = 0 + + def next_request_id(self): + request_id = f"{self.request_id}_{self.request_counter}" + self.request_counter += 1 + return request_id + + +@ray.remote +class GenerateScheduler(Scheduler): def __init__(self, pipeline_config=None): + super().__init__() self.cluster: Union[Any, Cluster] = None self.pipeline_config = pipeline_config - self.progress_bar: Optional[tqdm] = None - self.request_counter = itertools.count() - self.dp_fetch_count = {} - self.load_balance_coordinator = {} + self.mp_rank_zero = {} - self.data: Optional[DataProto] = None - self.responses: Dict[int, List[DataProto]] = defaultdict(list) - self.request_id_2_prompt_id: Dict[str, int] = {} - self.prompt_id_2_request_ids: Dict[int, set] = defaultdict(set) - self.response_batch_size: Optional[int] = None - self.abort_request_ids: set[str] = set() - self.input_data: Optional[DataProto] = None - self.is_completed = False - self.request_id_2_dp_rank = {} - self.completed_count = set() - self.prompt_count = 0 self.max_running_requests = 128 - self.alive_check_interval = 10 - self.last_alive_check = time.time() - self.lock = threading.Lock() - self.response_callback_fn = None + self.load_balancer = None - def generate(self, data: DataProto, actor_cluster: Union[Any, Cluster], pipeline_config) -> DataProto: - self.response_callback_fn = data.meta_info["response_callback_fn"] - self.pipeline_config = pipeline_config - self.cluster = actor_cluster - if len(self.mp_rank_zero) == 0: + async def generate(self, data: DataProto, actor_cluster: Union[Any, Cluster], pipeline_config) -> DataProto: + assert self.pipeline_config is None or pipeline_config is self.pipeline_config + if self.cluster is None: + self.cluster = actor_cluster dp_ranks: List[int] = [rank_info.dp_rank for rank_info in self.cluster.worker_rank_info] for i, dp_rank in enumerate(dp_ranks): rank_info = self.cluster.get_rank_info(rank=i) if rank_info.tp_rank == 0 and rank_info.pp_rank == 0 and rank_info.cp_rank == 0: self.mp_rank_zero[dp_rank] = self.cluster.workers[i] - self.dp_fetch_count = {dp_rank: 0 for dp_rank in self.mp_rank_zero.keys()} - self.load_balance_coordinator = {dp_rank: 0 for dp_rank in self.mp_rank_zero.keys()} - self.request_id_2_prompt_id.clear() - self.prompt_id_2_request_ids.clear() - self.abort_request_ids.clear() - self.request_id_2_dp_rank.clear() - self.completed_count.clear() generate_opt_level = pipeline_config.generate_opt_level num_return_sequences = actor_cluster.worker_config.generating_args.num_return_sequences - is_num_return_sequences_expand = pipeline_config.is_num_return_sequences_expand + if generate_opt_level == 0 and is_num_return_sequences_expand: logger.warning("is_num_return_sequences_expand=True and generate_opt_level may reduce performance.") - data.batch["prompt_id"] = torch.arange(data.batch.batch_size[0], device=data.batch.device) - self.input_data = data data.meta_info["is_num_return_sequences_expand"] = 
is_num_return_sequences_expand data.meta_info["num_return_sequences"] = num_return_sequences - self.prompt_count = self.input_data.batch.batch_size[0] - generation_config = self.cluster.worker_config.generating_args.to_dict() generation_config["num_return_sequences"] = num_return_sequences if is_num_return_sequences_expand: @@ -141,280 +675,136 @@ def generate(self, data: DataProto, actor_cluster: Union[Any, Cluster], pipeline non_tensor_batch=non_tensor_batch, meta_info=data.meta_info, ) - ret = self.cluster.generate(data) - self.input_data = None - return ret + output_in_dp = await asyncio.gather(*[ref.obj_ref for ref in self.cluster.generate(data=data, blocking=False)]) + return DataProto.concat(output_in_dp) elif generate_opt_level == 1: - # async + load balance - if is_num_return_sequences_expand: - batch_size = data.batch.batch_size[0] - output_batch_size = batch_size * num_return_sequences - input_ids = data.batch["input_ids"] - attention_mask = data.batch["attention_mask"] - position_ids = data.batch["position_ids"] - prompt_ids = data.batch["prompt_id"] - input_ids = input_ids.repeat(num_return_sequences, 1) - attention_mask = attention_mask.repeat(num_return_sequences, 1) - if position_ids.dim() == 3: # (bsz, 3, seqlen) - position_ids = position_ids.repeat(num_return_sequences, 1, 1) - non_tensor_batch = dict( - (k, np.tile(v, num_return_sequences)) - for k, v in data.non_tensor_batch.items()) - else: - position_ids = position_ids.repeat(num_return_sequences, 1) - non_tensor_batch = {} - prompt_ids = prompt_ids.unsqueeze(-1).repeat(num_return_sequences, 1) - - data = DataProto( - batch=TensorDict( - { - "input_ids": input_ids, - "attention_mask": attention_mask, - "position_ids": position_ids, - "prompt_id": prompt_ids, - }, - batch_size=output_batch_size, - ), - non_tensor_batch=non_tensor_batch, - meta_info=data.meta_info, - ) - self.is_completed = False - ret = self.generate_opt_level_1(data) - self.input_data = ret - return ret + return await self.generate_opt_level_1(data, num_return_sequences, is_num_return_sequences_expand) else: raise NotImplementedError(f"not support generate_opt_level {generate_opt_level}") - def get_available_dp_rank(self): - while True: - # 负载均衡逻辑,期望各dp 正在处理的条数基本接近 - sorted_ranks = sorted( - self.load_balance_coordinator.keys(), key=lambda rank: (self.load_balance_coordinator[rank], rank) - ) - if self.load_balance_coordinator[sorted_ranks[0]] < self.max_running_requests: - yield sorted_ranks[0] - - def send_request_to_one_worker(self, data: DataProto): - dp_rank = next(self.get_available_dp_rank()) - ray.get(self.cluster.workers[dp_rank].add_request.remote(command=GenerateRequestType.ADD, data=data)) - self.load_balance_coordinator[dp_rank] += 1 - self.dp_fetch_count[dp_rank] += 1 - - def generate_opt_level_1(self, data: DataProto): - # async++ - is_num_return_sequences_expand = self.pipeline_config.is_num_return_sequences_expand - num_return_sequences = self.cluster.worker_config.generating_args.num_return_sequences - - response_batch_size = 1 if is_num_return_sequences_expand else num_return_sequences - self.response_batch_size = response_batch_size - self.progress_bar = tqdm( - total=self.prompt_count, desc="generate progress(prompt)", mininterval=int(self.prompt_count * 0.1) + 1 - ) - - self.data = data - self.responses: Dict[int, List[DataProto]] = defaultdict(list) - - logger.info( - f"request id size: {data.batch.batch_size[0]} " - f"response_batch_size: {response_batch_size} " - f"is_num_return_sequences_expand: 
{is_num_return_sequences_expand}"
-        )
-        self.cluster.start_server(data=DataProto(meta_info=data.meta_info), blocking=True)
-
-        # 分发数据至收到target rollout 完成
-        # 无限循环,把所有的response发送给dp worker
-        send_request_count = 0
-        request_refs = []
-        data_index_counter = itertools.count()
-        last_alive_check = time.time()
-        while not self.is_completed:
-
-            # 探测dp worker是否存活,dp worker的server thread可能由于异常退出,造成hang
-            current_time = time.time()
-            if current_time - last_alive_check >= self.alive_check_interval:
-                self.cluster.add_request(command=GenerateRequestType.ALIVE_CHECK, data=DataProto())
-                last_alive_check = current_time
-
-            if send_request_count < data.batch.batch_size[0]:
-                # 取一个可以发送request的dp worker
-                dp_rank = next(self.get_available_dp_rank())
-
-                # 还有数据需要发送, 取需要发送的数据
-                # request_id 全局递增,否则vllm/sglang scheduler状态不对
-                request_id = next(self.request_counter)
-                data_index = next(data_index_counter)
-                request_data = collate_fn([self.data[data_index]])
-                request_data.meta_info["request_id"] = str(request_id)
-                prompt_id = self.data[data_index].batch["prompt_id"].item()
-                self.request_id_2_prompt_id[request_data.meta_info["request_id"]] = request_data.batch[
-                    "prompt_id"
-                ].item()
-                self.request_id_2_dp_rank[request_data.meta_info["request_id"]] = dp_rank
-                self.prompt_id_2_request_ids[prompt_id].add(request_data.meta_info["request_id"])
-                # 需要注意上面的调用顺序, report_response中会更新request_id索引dp_rank,所以这里需要最后add request_id
-                request_data.meta_info["response_callback_fn"] = self.response_callback_fn
-                request_data.meta_info["generation_config"] = data.meta_info["generation_config"]
-                request_refs.append(
-                    self.cluster.workers[dp_rank].add_request.remote(
-                        command=GenerateRequestType.ADD, data=request_data
-                    )
-                )
-                with self.lock:
-                    self.load_balance_coordinator[dp_rank] += 1
-                    self.dp_fetch_count[dp_rank] += 1
-                send_request_count += 1
-                if len(request_refs) % self.cluster.world_size == 0:
-                    ray.get(request_refs)
-                    request_refs = []
-
-        gen_metrics = self.cluster.stop_server()
-        generate_return_num = num_return_sequences
+    async def generate_opt_level_1(self, data: DataProto, num_return_sequences, is_num_return_sequences_expand):
+        batch_size = data.batch.batch_size[0]
+        progress_bar = tqdm(total=batch_size, desc="generate progress(prompt)", mininterval=int(batch_size * 0.1) + 1)
+        self.load_balancer = LoadBalancer(self.mp_rank_zero, self.max_running_requests)
+
+        is_offload_states = data.meta_info.get("is_offload_states", True)
+        await asyncio.gather(*[ref.obj_ref for ref in self.cluster.load_states(blocking=False)])
+
+        tasks = []
+        for data_index in range(batch_size):
+            request_data = collate_fn([data[data_index]])
+            request_data_list = expand_requests(data=request_data,
+                num_return_sequences=num_return_sequences, is_num_return_sequences_expand=is_num_return_sequences_expand)
+
+            prompt_requests = []
+            for req in request_data_list:
+                lease = await self.load_balancer.acquire(1)
+                async def _generate_request(data: DataProto, lease):
+                    async with lease.lock(1) as dp_rank:
+                        request_id = self.next_request_id()
+                        data.meta_info["request_id"] = request_id
+                        data.meta_info["generation_config"] = data.meta_info["generation_config"]
+                        response = await self.cluster.workers[dp_rank].generate_request.remote(data=data)
+                        return response
+                prompt_requests.append(asyncio.create_task(_generate_request(data=req, lease=lease)))
+
+            async def gather_one_prompt(requests):
+                """
+                Gather all requests of one prompt.
+                """
+                responses = await asyncio.gather(*requests)
+                progress_bar.update(1)
+                return responses
+
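+            # One gather task per prompt: the per-sample request tasks were created above,
+            # so gather_one_prompt only awaits them and advances the progress bar.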
tasks.append(asyncio.create_task(gather_one_prompt(requests=prompt_requests))) + assert self.load_balancer.empty() + response_list = await asyncio.gather(*tasks) + + if is_offload_states: + await asyncio.gather(*[ref.obj_ref for ref in self.cluster.offload_states(blocking=False)]) response_ids_list_of_list = [] eos_token_id = None pad_token_id = None - for sample_index in range(len(self.responses)): + for responses in response_list: response_ids_list = [] - for response in self.responses[sample_index]: + for response in responses: eos_token_id = response.meta_info["eos_token_id"] pad_token_id = response.meta_info["pad_token_id"] response_ids_list.extend(response.meta_info["output_token_ids"]) - assert ( - len(response_ids_list) >= generate_return_num - ), f"response_ids_list length {len(response_ids_list)} < generate_return_num {generate_return_num}" - response_ids_list_of_list.extend(response_ids_list[:generate_return_num]) + assert len(response_ids_list) == num_return_sequences + response_ids_list_of_list.extend(response_ids_list) response_ids_list_of_list = [torch.tensor(token_ids) for token_ids in response_ids_list_of_list] output_tensor = pad_sequence(response_ids_list_of_list, batch_first=True, padding_value=pad_token_id) output_tensor = concatenate_input_and_output( - input_ids=self.input_data.batch["input_ids"], + input_ids=data.batch["input_ids"], output_ids=output_tensor, - num_return_sequences=generate_return_num, + num_return_sequences=num_return_sequences, ) output: DataProto = postprocess_generate( - prompts=self.input_data, + prompts=data, output=output_tensor, - num_return_sequences=generate_return_num, + num_return_sequences=num_return_sequences, sequence_length=self.pipeline_config.sequence_length, eos_token_id=eos_token_id, pad_token_id=pad_token_id, ) - _, sorted_indices = torch.sort(output.batch["prompt_id"]) - output.reorder(indices=sorted_indices) - output.pop("prompt_id") - self.data = None - output.meta_info["metrics"] = reduce_metrics(gen_metrics.meta_info.pop("metrics", {})) - logger.info(f"dp_fetch_count: {self.dp_fetch_count}") return output - @ray.method(concurrency_group="single_thread") - def report_response(self, data: DataProto): - """ - 本质上也是维护了一个状态机 - """ - request_id = data.meta_info["request_id"] - prompt_id = self.request_id_2_prompt_id[request_id] - dp_rank = self.request_id_2_dp_rank[request_id] - with self.lock: - self.load_balance_coordinator[dp_rank] -= 1 - - if self.is_completed: - return - - self.responses[prompt_id].append(data) - required_response_count = self.cluster.worker_config.generating_args.num_return_sequences - self.prompt_id_2_request_ids[prompt_id].remove(data.meta_info["request_id"]) - if len(self.responses[prompt_id]) * self.response_batch_size >= required_response_count: - # 取已经完成的prompt_id,对应的request_ids,需要都取消 - if prompt_id not in self.completed_count: - self.progress_bar.update(1) - self.completed_count.add(prompt_id) - abort_refs = [] - for request_id in self.prompt_id_2_request_ids[prompt_id]: - with self.lock: - self.load_balance_coordinator[dp_rank] -= 1 - abort_refs.append( - self.cluster.workers[dp_rank].add_request.remote( - command=GenerateRequestType.ABORT, data=DataProto(meta_info={"request_id": request_id}) - ) - ) - if len(self.completed_count) >= self.prompt_count: - self.is_completed = True - - -@ray.remote(concurrency_groups={"single_thread": 1, "multi_thread": 256}) -class DynamicSamplingScheduler: +class DynamicSamplingScheduler(RolloutMockMixin, Scheduler): def __init__(self, pipeline_config=None): + 
super().__init__() self.pipeline_config = pipeline_config set_seed(seed=pipeline_config.seed) - self.progress_bar: Optional[tqdm] = None - self.request_counter = None - self.dp_fetch_count = {} - self.load_balance_coordinator = {} - self.mp_rank_zero = {} - self.request_id_2_prompt_id: Dict[str, int] = {} - self.prompt_id_2_request_ids: Dict[int, set] = defaultdict(set) - # prompt_id to unique prompt hash value - self.prompt_id_2_hash_str: Dict[int, str] = {} - self.response_batch_size: Optional[int] = None - self.abort_request_ids: set[str] = set() - self.request_id_2_dp_rank = {} - self.requests_buffers: Dict[str, DataProto] = {} - self.lock = threading.Lock() - self.last_alive_check = time.time() - self.dataset_iter_count = 0 - self.exception_queue = queue.Queue() - self.running = False - self.dataset_epoch = 0 - self.reward_scheduler = RewardScheduler() - # Flow control measures. max_running_requests limits the maximum number of concurrent requests for each dp. - # max_additional_running_prompts limits the number of prompts running simultaneously to avoid excessive consumption of prompts. - self.max_running_requests = self.pipeline_config.max_running_requests - self.max_additional_running_prompts = self.pipeline_config.max_additional_running_prompts - self.is_use_additional_prompts = self.pipeline_config.is_use_additional_prompts - self.alive_check_interval = self.pipeline_config.alive_check_interval + self.sequence_length = pipeline_config.sequence_length self.actor_cluster = None + self.mp_rank_zero = {} + self.reward_clusters = None self.reward_worker_iters = None + self.dataset = None self.indices = [] - self.batch_size = None + self.dataset_epoch = 0 self.dataset_iter = None + self.dataset_iter_count = 0 + self.collect_fn_cls = None self.collect_fn_kwargs = None self.collect_fn = None self.tokenizer = None self.processor = None - self.response_filter_fn = None - self.query_filter_fn = None - self.response_callback_fn = None - self.generation_config = None - - self.completed_buffers = None - self.query_group_buffers = None - - self.query_filter_count = 0 - self.response_filter_count = 0 - self.running_prompts = 0 - self.response_cache: Dict[str, List] = None - self.prompt_use_count = 0 - self.sequence_length = pipeline_config.sequence_length - def set_scheduler( + self.async_sending_task = None + self.replay_buffer = None + self.load_balancer = None + self.running_requests = None + self.running_tasks = None + + # metrics of a step + self.generate_timer = None + self.reward_timer = None + + # meta_info is reassigned every step + self.meta_info = None + + udrl_cls = safe_import_class(pipeline_config.user_defined_rollout_loop_cls) + assert udrl_cls + self.udrl = udrl_cls() + + self.reward_scheduler = RewardScheduler() + + async def set_scheduler( self, actor_cluster: Union[Any, Cluster], reward_clusters: Dict[str, Union[Any, Cluster]], dataset: Dataset, collect_fn_cls, collect_fn_kwargs, - response_filter_fn=None, - query_filter_fn=None, - response_callback_fn=None, state: Dict[str, Any] = None, is_val: bool = False, - is_vlm: bool = False, ): """ GenerateScheduler可以由多个实例,不再局限于单例 @@ -426,14 +816,27 @@ def set_scheduler( else: logger.info(f"training generate scheduler sequence_length is: {self.sequence_length}") + # Initialize rollout mock mechanism from mixin (after is_val is set) + self._init_rollout_mock() + self.actor_cluster = actor_cluster + dp_ranks: List[int] = [rank_info.dp_rank for rank_info in self.actor_cluster.worker_rank_info] + for i, dp_rank in enumerate(dp_ranks): + 
rank_info = self.actor_cluster.get_rank_info(rank=i) + if rank_info.tp_rank == 0 and rank_info.pp_rank == 0 and rank_info.cp_rank == 0: + self.mp_rank_zero[dp_rank] = self.actor_cluster.workers[i] + self.reward_clusters = reward_clusters self.reward_worker_iters = {} for domain, cluster in reward_clusters.items(): self.reward_worker_iters[domain] = itertools.cycle(cluster.workers) + self.generate_timer = {domain: DurationTracker() for domain in reward_clusters.keys()} + self.reward_timer = {domain: DurationTracker() for domain in reward_clusters.keys()} + self.dataset = dataset self.indices = list(range(len(dataset))) + # TODO: (async training) test resume if state is not None and state.get("dataset_iter_count", 0) > 0: for _ in range(state["dataset_iter_count"]): self.get_next_dataset_item() @@ -442,58 +845,75 @@ def set_scheduler( self.collect_fn_kwargs = collect_fn_kwargs self.tokenizer = default_tokenizer_provider(model_args=self.actor_cluster.worker_config.model_args) self.processor = default_processor_provider(model_args=self.actor_cluster.worker_config.model_args) - if is_vlm: + if "processor" in [f.name for f in fields(collect_fn_cls)]: collect_fn_kwargs["processor"] = self.processor self.collect_fn = self.collect_fn_cls(tokenizer=self.tokenizer, **self.collect_fn_kwargs) - if self.is_use_additional_prompts: - self.response_filter_fn = response_filter_fn - self.query_filter_fn = query_filter_fn - else: - self.response_filter_fn = lambda data_list, config: True - self.query_filter_fn = lambda data_list, config: True - logger.info(f"use_additional_prompts is False, disable query and response filtering.") - self.response_callback_fn = response_callback_fn - dp_ranks: List[int] = [rank_info.dp_rank for rank_info in self.actor_cluster.worker_rank_info] - for i, dp_rank in enumerate(dp_ranks): - rank_info = self.actor_cluster.get_rank_info(rank=i) - if rank_info.tp_rank == 0 and rank_info.pp_rank == 0 and rank_info.cp_rank == 0: - self.mp_rank_zero[dp_rank] = self.actor_cluster.workers[i] - - self.request_counter = GlobalCounter.options( - name=f"DynamicSchedulerRequestCounter", - get_if_exists=True, - namespace=RAY_NAMESPACE, - ).remote() - - def reset_status(self): - self.completed_buffers: Dict[int, List[DataProto]] = defaultdict(list) - self.query_group_buffers: Dict[int, List[DataProto]] = defaultdict(list) - - self.dp_fetch_count = {dp_rank: 0 for dp_rank in self.mp_rank_zero.keys()} - self.load_balance_coordinator = {dp_rank: 0 for dp_rank in self.mp_rank_zero.keys()} - self.request_id_2_prompt_id.clear() - self.prompt_id_2_request_ids.clear() - self.prompt_id_2_hash_str.clear() - self.abort_request_ids.clear() - self.request_id_2_dp_rank.clear() - self.requests_buffers.clear() - self.response_filter_count = 0 - self.query_filter_count = 0 - self.running_prompts = 0 - self.prompt_use_count = 0 - self.response_cache = defaultdict(list) - self.exception_queue = queue.Queue() - bar_name = "-".join(self.reward_clusters.keys()) - self.progress_bar = tqdm( - total=self.batch_size, - desc=f"{bar_name} generate progress(prompt)", - mininterval=int(self.batch_size * 0.1) + 1, + # Dynamic filter is supported no matter whether is_use_additional_prompts, + # is_use_additional_prompts is required when using dynamic num_return_sequences. 
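+        # Note: for validation (is_val) the buffer below is forced into synchronous mode:
+        # no async look-ahead (async_generation_ratio=0) and no additional prompts.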
+ self.replay_buffer = ReplayBuffer( + async_generation_ratio=self.pipeline_config.async_generation_ratio if not is_val else 0, + is_use_additional_prompts=self.pipeline_config.is_use_additional_prompts if not is_val else False, + max_additional_running_prompts=self.pipeline_config.max_additional_running_prompts if not is_val else 0, + ) + self.load_balancer = LoadBalancer(self.mp_rank_zero, self.pipeline_config.max_running_requests) + # dp_rank -> prompt_id -> request_ids + self.running_requests: Dict[int, Dict[int, Set[str]]] = {dp_rank: defaultdict(set) for dp_rank in self.mp_rank_zero.keys()} + self.running_tasks: Dict[int, asyncio.Task] = {} + + # async_sending_task is paused at start. But can not call self.pause_sampling directly here, + # because ActorInfer.strategy has not been initialized yet and is not ready to serve abort_requests rpc. + self.load_balancer.suspend() + + # async_sending_task coroutine will last during the whole training process, only stop at shutdown or exception. + # Because we do not need to pause all running prompts but only suspend generate requests, so that reward requests + # still can run during model update. + self.async_sending_task = asyncio.create_task(self.sending_request()) + + async def abort_running_requests(self): + dp_requests = {} + for dp_rank, prompt_requests in self.running_requests.items(): + dp_requests[dp_rank] = [] + for request_ids in prompt_requests.values(): + dp_requests[dp_rank].extend(request_ids) + await asyncio.gather( + *[ + self.actor_cluster.workers[dp_rank].abort_requests.remote(request_ids) + for dp_rank, request_ids in dp_requests.items() + ] ) - def get_batch_opt_level_0(self, data: DataProto, batch_size: int) -> DataProto: + def gc(self): + aborted_prompts = self.replay_buffer.gc() + for prompt_id in aborted_prompts: + if task := self.running_tasks.get(prompt_id, None): + task.cancel() + + async def pause_sampling(self): + self.load_balancer.suspend() + self.gc() + await self.abort_running_requests() + await self.load_balancer.wait_complete() + logger.info(f"sampling paused, replay_buffer info: {self.replay_buffer.info()}") + + async def shutdown(self): + self.replay_buffer.shutdown() + self.load_balancer.resume() + self.gc() + await self.abort_running_requests() + await self.load_balancer.wait_complete() + await self.async_sending_task + + async def get_batch_opt_level_0(self, data: DataProto, batch_size: int) -> DataProto: + generation_config = copy.deepcopy(data.meta_info["generation_config"]) completed_data: List[DataProto] = [] query_use_count = 0 + if self.is_val: + query_filter_fn = lambda data_list, config: True + else: + from roll.distributed.scheduler.user_defined_rollout_loop import query_filter + query_filter_fn = query_filter + query_filter_count = 0 while len(completed_data) < batch_size: data_item_list = [self.get_next_dataset_item() for _ in range(batch_size)] @@ -503,7 +923,7 @@ def get_batch_opt_level_0(self, data: DataProto, batch_size: int) -> DataProto: gen_batch = request_data.pop(batch_keys=["input_ids", "attention_mask", "position_ids"]) gen_batch.meta_info = request_data.meta_info - num_return_sequences = self.generation_config["num_return_sequences"] + num_return_sequences = generation_config["num_return_sequences"] request_data = request_data.repeat(repeat_times=num_return_sequences) # Pad gen_batch to be divisible by dp_size to avoid errors @@ -513,7 +933,7 @@ def get_batch_opt_level_0(self, data: DataProto, batch_size: int) -> DataProto: batch.union(other=request_data) 
batch.rename(old_keys="prompt_id", new_keys="origin_prompt_id") - batch_rewards = self.reward_scheduler.compute_rewards(data=batch, reward_clusters=self.reward_clusters, pipeline_config=self.pipeline_config) + batch_rewards = await self.reward_scheduler.compute_rewards(data=batch, reward_clusters=self.reward_clusters, pipeline_config=self.pipeline_config) metrics = batch.meta_info.pop("metrics", {}) metrics.update(batch_rewards.meta_info.pop("metrics", {})) @@ -522,216 +942,144 @@ def get_batch_opt_level_0(self, data: DataProto, batch_size: int) -> DataProto: batch.meta_info["metrics"] = metrics batch_grouped: Dict[str, DataProto] = batch.group_by("origin_prompt_id") for prompt_id, batch_item in batch_grouped.items(): - if self.query_filter_fn([batch_item], self.pipeline_config): + if query_filter_fn([batch_item], self.pipeline_config): completed_data.append(batch_item) else: - self.query_filter_count += 1 + query_filter_count += 1 query_use_count += batch_size - batch = DataProto.concat(completed_data[: self.batch_size]) + batch = DataProto.concat(completed_data[: batch_size]) batch.meta_info["metrics"] = { - f"scheduler/query_filter_count": self.query_filter_count, - f"scheduler/response_filter_count": self.response_filter_count, - f"scheduler/collect_query_count": self.batch_size, + f"scheduler/query_filter_count": query_filter_count, + f"scheduler/collect_query_count": batch_size, f"scheduler/query_use_count": query_use_count, } - self.reset_status() return batch - - def get_batch(self, data: DataProto, batch_size: int) -> DataProto: - """ - 从dataset里,按给定策略sample batch - 1. 常规无过滤 - 2. 动态过滤 - """ - self.batch_size = batch_size - self.reset_status() - self.running = True - self.generation_config = copy.deepcopy(data.meta_info["generation_config"]) + async def get_batch(self, data: DataProto, global_step: int, batch_size: int) -> DataProto: + # MOCK MODE: Load pre-recorded data, skip rollout (from mixin) + if self._should_load_mock(global_step): + return await self._load_mock_batch(global_step) if self.pipeline_config.generate_opt_level == 0: - return self.get_batch_opt_level_0(data, batch_size) + return await self.get_batch_opt_level_0(data, batch_size) - prompt_id_counter = itertools.count() - num_return_sequences = self.generation_config["num_return_sequences"] - while True: - if ( - sum([len(v) for v in list(self.completed_buffers.values())[:]]) - >= self.batch_size * num_return_sequences - ): - self.running = False - break - self.check_worker_alive(self.actor_cluster) - self.check_response_callback() - if not self.check_send_new_request(): - time.sleep(1) - continue + num_return_sequences = data.meta_info["generation_config"]["num_return_sequences"] + self.meta_info = copy.deepcopy(data.meta_info) + self.meta_info["collect_non_finish"] = self.pipeline_config.async_generation_ratio > 0 - # get a query from dataset - prompt_id = next(prompt_id_counter) - dataset_item = self.get_next_dataset_item() - if int(os.environ.get("REPORT_LENGTH_AND_REWARDS", "0")): - prompt_digest = hashlib.md5( - (dataset_item.get('prompt', '') + dataset_item.get('messages', '')).encode() - ).digest() - domain = dataset_item.get("domain", "default") - collect_data = self.collect_fn([dataset_item]) - request_data: DataProto = DataProto.from_single_dict(collect_data, meta_info=data.meta_info) + assert self.load_balancer.empty(), f"worker state: {self.load_balancer.workers}" + assert all(len(requests) == 0 for prompt_requests in self.running_requests.values() for requests in prompt_requests.values()) + # Notice: 
self.replay_buffer.running_prompts may not be 0 because + # pause_sampling only pause generate request but not reward request. - # replica, redundancy - request_data_list = self.expand_requests(request_data) - - dp_rank = next(self.get_available_dp_rank()) - with self.lock: - self.prompt_use_count += 1 - self.running_prompts += 1 - for req in request_data_list: - # get a available worker, 需要控制max_running_request, 当前策略会始终保持worker的满载 - request_id = ray.get(self.request_counter.get_value.remote()) - req.meta_info["request_id"] = f"{request_id}" - req.meta_info["response_callback_fn"] = self.response_callback_fn - self.request_id_2_prompt_id[req.meta_info["request_id"]] = prompt_id - self.request_id_2_dp_rank[req.meta_info["request_id"]] = dp_rank - self.prompt_id_2_request_ids[prompt_id].add(req.meta_info["request_id"]) # 用于replica情况 - if int(os.environ.get("REPORT_LENGTH_AND_REWARDS", "0")): - self.prompt_id_2_hash_str[prompt_id] = base64.urlsafe_b64encode(prompt_digest).decode().rstrip('=') # prompt_id 对应 unique prompt - self.requests_buffers[req.meta_info["request_id"]] = req - self.actor_cluster.workers[dp_rank].add_request.remote(command=GenerateRequestType.ADD, data=req) - req.meta_info.pop("response_callback_fn") - self.load_balance_coordinator[dp_rank] += 1 - self.dp_fetch_count[dp_rank] += 1 - - completed_buffers = {k: v for k, v in self.completed_buffers.items() if len(v) > 0} - collect_data = [item for sublist in list(completed_buffers.values())[:] for item in sublist] - query_use_count = next(prompt_id_counter) - logger.info( - f"total collect data: {len(collect_data)}, collect queries: {len(completed_buffers)} " - f"used queries: {query_use_count} query_filter_count: {self.query_filter_count} " - f"response_filter_count: {self.response_filter_count}" + self.replay_buffer.advance_step(step=global_step, batch_size=batch_size) + logger.info(f"start sampling, {global_step=} {batch_size=}, {self.replay_buffer.info()}") + self.load_balancer.resume() + + bar_name = "-".join(self.reward_clusters.keys()) + progress_bar = tqdm( + total=batch_size * num_return_sequences, + desc=f"{bar_name} generate progress(prompt)", + mininterval=int(batch_size * 0.1) + 1, ) - # TODO: 这里 len(collect_data) > rollout_batch_size, 可以尝试动态扩大batch_size - batch = DataProto.concat(collect_data[: self.batch_size * num_return_sequences]) + + # await on both get_batch and async_sending_task to receive exception from async_sending_task as soon as possible + get_task = asyncio.create_task( + self.replay_buffer.get_batch(expected_samples=batch_size * num_return_sequences, progress_bar=progress_bar) + ) + await asyncio.wait({get_task, self.async_sending_task}, return_when=asyncio.FIRST_COMPLETED) + if self.async_sending_task.done(): + assert self.async_sending_task.exception() is not None + await self.async_sending_task + assert get_task.done() + finished_items = await get_task + + if self.pipeline_config.is_use_additional_prompts: + # Keep the first batch_size*num_return_sequences ExperienceItem now. 
+ assert len(finished_items) >= batch_size * num_return_sequences + finished_items = finished_items[:batch_size * num_return_sequences] + assert len(finished_items) == batch_size * num_return_sequences + batch = self.collect_items_as_batch(finished_items=finished_items) + + if self.is_val or self.pipeline_config.async_generation_ratio <= 0: + await self.pause_sampling() + assert not self.replay_buffer.groups, f"{self.replay_buffer.groups=}" + + for domain in self.reward_clusters.keys(): + metrics = {} + generate_stat = self.generate_timer[domain].log() + metrics[f"scheduler/{domain}/time/generate/count"] = generate_stat["count"] + metrics[f"scheduler/{domain}/time/generate/min"] = generate_stat["min"] + metrics[f"scheduler/{domain}/time/generate/max"] = generate_stat["max"] + metrics[f"scheduler/{domain}/time/generate/mean"] = generate_stat["mean"] + reward_stat = self.reward_timer[domain].log() + metrics[f"scheduler/{domain}/time/reward/count"] = reward_stat["count"] + metrics[f"scheduler/{domain}/time/reward/min"] = reward_stat["min"] + metrics[f"scheduler/{domain}/time/reward/max"] = reward_stat["max"] + metrics[f"scheduler/{domain}/time/reward/mean"] = reward_stat["mean"] + batch.meta_info["metrics"].update(metrics) + + # DUMP MODE: Save merged batch (from mixin) + await self._maybe_dump_batch(batch, global_step) + + return batch + + def collect_items_as_batch(self, finished_items: List[ExperienceItem]) -> DataProto: + collect_data_by_domain = defaultdict(list) + data_off_policy_step = 0.0 + prompt_ids = set() + for item in finished_items: + collect_data_by_domain[item.domain].append(item.data) + data_off_policy_step += self.replay_buffer.current_step - item.sampling_start_step + prompt_ids.add(item.prompt_id) + data_off_policy_step = data_off_policy_step / len(finished_items) + + collect_data_by_domain = { + domain: DataProto.concat(data_list) for domain, data_list in collect_data_by_domain.items() + } + query_use_count = len(prompt_ids) + collect_data_num = sum(data.batch.batch_size[0] for data in collect_data_by_domain.values()) + assert collect_data_num == len(finished_items) + logger.info(f"total collect data: {collect_data_num}, collect queries: {query_use_count}") + + batch = DataProto.concat(list(collect_data_by_domain.values())) + # TODO support response_filter_count and query_filter_count batch.meta_info["metrics"] = { - f"scheduler/query_filter_count": self.query_filter_count, - f"scheduler/response_filter_count": self.response_filter_count, - f"scheduler/collect_query_count": len(completed_buffers), + f"scheduler/collect_query_count": query_use_count, f"scheduler/query_use_count": query_use_count, + f"scheduler/off_policy_ratio": data_off_policy_step, } - # 统计全部response metrics metrics = {} - for domain, response_batches in self.response_cache.items(): - response_batch = DataProto.concat(response_batches[:]) + for domain, response_batch in collect_data_by_domain.items(): sequence_score = response_batch.batch["scores"] metrics[f"scheduler/{domain}/score/mean"] = torch.mean(sequence_score).detach().item() metrics[f"scheduler/{domain}/score/max"] = torch.max(sequence_score).detach().item() metrics[f"scheduler/{domain}/score/min"] = torch.min(sequence_score).detach().item() - batch.meta_info["metrics"].update(metrics) - self.reset_status() + + # TODO shigao implement REPORT_LENGTH_AND_REWARDS (deleted at refactor) return batch - @ray.method(concurrency_group="multi_thread") - def report_response(self, data: DataProto): + async def sending_request(self): """ - 这里需要考虑多线程数据访问 - 
data 返回可能有多条的 + See documentation of ReplyBuffer for recommended usage of ReplayBuffer and LoadBalancer. """ - try: - request_id = data.meta_info["request_id"] - prompt_id = self.request_id_2_prompt_id[request_id] - num_return_sequences = self.generation_config["num_return_sequences"] - - batch = self.postprocess_output_ids(data) - output_count = batch.batch.batch_size[0] - with self.lock: - self.load_balance_coordinator[self.request_id_2_dp_rank[request_id]] -= 1 - self.prompt_id_2_request_ids[prompt_id].remove(request_id) - domain = "default" - if "domain" in batch.non_tensor_batch.keys(): - domain = batch.non_tensor_batch["domain"][0] - reward_worker = next(self.reward_worker_iters[domain]) - - if not self.running: - return - - # call reward - # reward worker得能支持单条数据计算, dynamic sampling对需要batch计算reward的需要注意... - # 多域的时候,llm as judge, 需要单独为reward worker分配gpu - - # set rollout id - batch.non_tensor_batch["rollout_id"] = np.array([str(uuid.uuid4()) for _ in range(output_count)], dtype=object) - - rewards: DataProto = ray.get(reward_worker.compute_rewards.remote(batch)) - batch.union(rewards) - - response_buffers: List[DataProto] = [] - batch_expanded = [batch[[idx]] for idx in range(output_count)] - - # response_filter, 不太需要response filter - for batch_item in batch_expanded: - if self.response_filter_fn(batch_item, self.pipeline_config): - response_buffers.append(batch_item) - else: - self.response_filter_count += 1 - - with self.lock: - self.response_cache[domain].extend(batch_expanded) - - if len(response_buffers) == 0: - if len(self.prompt_id_2_request_ids[prompt_id]) == 0: - self.running_prompts -= 1 - return - - if len(self.completed_buffers[prompt_id]) > 0: - return - - # expand batch to response - self.query_group_buffers[prompt_id].extend(response_buffers) - - # query_filter, query has n responses - if len(self.query_group_buffers[prompt_id]) >= num_return_sequences: - if not self.query_filter_fn(self.query_group_buffers[prompt_id], self.pipeline_config): - self.query_filter_count += 1 - del self.query_group_buffers[prompt_id] - self.abort_requests(self.prompt_id_2_request_ids[prompt_id]) - return - - assert len(self.query_group_buffers[prompt_id]) >= num_return_sequences, ( - f"expect to generate {num_return_sequences} results from one prompt, " - f"but get {len(self.query_group_buffers[prompt_id])}." 
- ) - self.completed_buffers[prompt_id] = self.query_group_buffers[prompt_id][:num_return_sequences] - self.progress_bar.update() - - if int(os.environ.get("REPORT_LENGTH_AND_REWARDS", "0")): - # report response level rewards - response_level_rewards = [data.batch["response_level_rewards"] for data in self.query_group_buffers[prompt_id]] - response_rewards = torch.cat(response_level_rewards, dim=0).long().cpu().tolist() - prompt_hash = self.prompt_id_2_hash_str.pop(prompt_id) - prompt_response_proto = DataProto.concat(self.query_group_buffers[prompt_id][:num_return_sequences]) - # report response level lengths - response_lengths = torch.sum(prompt_response_proto.batch["response_mask"], dim=1).cpu().tolist() - - lengths_and_rewards = { - 'domain': domain, - 'prompt_hash': prompt_hash, - 'response_lengths': response_lengths, - 'response_rewards': response_rewards - } - length_dir = os.path.join(self.pipeline_config.length_profiler_dir, "length") - os.makedirs(length_dir, exist_ok=True) - filename = f"response-length-and-rewards-{domain}-ep{self.dataset_epoch}.jsonl" - length_file_path = os.path.join(length_dir, filename) - with open(length_file_path, "a") as f: - f.write(json.dumps(lengths_and_rewards) + "\n") - - # abort uncompleted request - self.abort_requests(self.prompt_id_2_request_ids[prompt_id]) - except Exception as e: - self.exception_queue.put(e) + async with TaskGroup() as tg: + while True: + try: + prompt_id = await self.replay_buffer.poll() + except: + logger.info(f"stop sending_request coroutine") + break + task = tg.create_task(RolloutContext.process_new_prompt(scheduler=self, prompt_id=prompt_id)) + self.running_tasks[prompt_id] = task + + # The above loop only break at shutdown, it is safe to abort all infligh requests here. + await self.abort_running_requests() + # Implicitly wait until all running tasks finished when TaskGroup context exit. def get_next_dataset_item(self): if self.dataset_iter is None: @@ -755,147 +1103,238 @@ def get_next_dataset_item(self): def get_scheduler_state(self): return {"dataset_iter_count": self.dataset_iter_count} - def abort_requests(self, request_ids: Set[str]): - abort_refs = [] - self.running_prompts -= 1 - for request_id in request_ids: - dp_rank = self.request_id_2_dp_rank[request_id] - self.load_balance_coordinator[dp_rank] -= 1 - abort_refs.append( - self.actor_cluster.workers[dp_rank].add_request.remote( - command=GenerateRequestType.ABORT, data=DataProto(meta_info={"request_id": request_id}) - ) - ) - def postprocess_output_ids(self, data: DataProto) -> DataProto: - # postprocess_generate, input_ids, attention_mask, left pad - request_id = data.meta_info["request_id"] - request: DataProto = self.requests_buffers.pop(request_id) +class RolloutContext: + """ + Helper class to manage life cycle of rollout of a prompt. + Provide a context manager based interface to user and hide implementation + details of DynamicSamplingScheduler, LoadBalancer and ReplayBuffer from user. 
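+
+    Illustrative sketch of a user-defined rollout loop built on this context
+    (editor's example; names below are placeholders, not part of this patch):
+
+        async def process_new_prompt(self, context):
+            data, domain = context.get_request_data(meta_info=context.meta_info)
+            n = data.meta_info["generation_config"]["num_return_sequences"]
+            async with context.do_generate_and_reward(max_concurrency=n):
+                response = await context.generate(data, domain=domain)
+                rewards = await context.compute_rewards(response, domain=domain)
+                response.union(rewards)
+            return response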
+ """ - eos_token_id = data.meta_info["eos_token_id"] - pad_token_id = data.meta_info["pad_token_id"] - output_token_ids = data.meta_info["output_token_ids"] - output_tokens = [torch.tensor(token_ids) for token_ids in output_token_ids] - - output_logprobs = data.meta_info.get("output_logprobs", None) - - output_tensor = pad_sequence(output_tokens, batch_first=True, padding_value=pad_token_id) - output_tensor = concatenate_input_and_output( - input_ids=request.batch["input_ids"], output_ids=output_tensor, num_return_sequences=len(output_tokens) - ) - output: DataProto = postprocess_generate( - prompts=request, - output=output_tensor, - num_return_sequences=len(output_tokens), - sequence_length=self.sequence_length, - eos_token_id=eos_token_id, - pad_token_id=pad_token_id, - output_logprobs=output_logprobs, - ) - request_repeat = request.repeat(repeat_times=len(output_tokens)) - output.non_tensor_batch = request_repeat.non_tensor_batch - output.meta_info = request_repeat.meta_info - return output - - def expand_requests(self, data: DataProto): - """ - replica, 以及redundancy - """ - generate_opt_level = self.pipeline_config.generate_opt_level - is_num_return_sequences_expand = self.pipeline_config.is_num_return_sequences_expand - num_return_sequences = self.generation_config["num_return_sequences"] - - assert generate_opt_level > 0, ( - f"generate_opt_level {generate_opt_level} should > 0, " f"in dynamic sampling scheduler." - ) - assert "generation_config" in data.meta_info, f"data {data.meta_info} should have key 'generation_config'" - generation_config = data.meta_info["generation_config"] - - target_requests = [] - if is_num_return_sequences_expand: - generation_config["num_return_sequences"] = 1 - for _ in range(num_return_sequences): - target_requests.append(copy.deepcopy(data)) + @staticmethod + async def process_new_prompt( + scheduler: DynamicSamplingScheduler, + prompt_id: int, + ): + num_return_sequences = scheduler.meta_info["generation_config"]["num_return_sequences"] + context = RolloutContext(scheduler=scheduler, prompt_id=prompt_id, meta_info=scheduler.meta_info) + success = False + try: + responses = await scheduler.udrl.process_new_prompt(context=context) + if responses is None: + logger.info(f"filter out prompt {prompt_id}") + raise asyncio.CancelledError # abort this prompt + responses = expand_responses(responses) + assert ( + len(responses) == num_return_sequences or scheduler.replay_buffer.is_use_additional_prompts + ), "is_use_additional_prompts is required when using dynamic num_return_sequences" + except Exception as e: + logger.warning(f"abort prompt {prompt_id} on exception {e}") + raise else: - generation_config["num_return_sequences"] = num_return_sequences - target_requests.append(copy.deepcopy(data)) - - return target_requests - - def check_worker_alive(self, cluster): - # 探测dp worker是否存活,dp worker的server thread可能由于异常退出,造成hang - current_time = time.time() - if current_time - self.last_alive_check >= self.alive_check_interval: - cluster.add_request(command=GenerateRequestType.ALIVE_CHECK, data=DataProto()) - self.last_alive_check = current_time - - def check_response_callback(self): - if self.exception_queue.qsize() > 0: - e = self.exception_queue.get() - logger.error(f"report_response get exception {e}") - raise e - - def check_send_new_request(self) -> bool: - if self.running_prompts >= (self.batch_size + self.max_additional_running_prompts): - return False - if not self.is_use_additional_prompts and self.prompt_use_count >= self.batch_size: - return False - 
return True + success = True + finally: + scheduler.running_tasks.pop(prompt_id, None) + + # commit/abort should be put at last in finally block, because commit may raise exception + if not success: + scheduler.replay_buffer.abort(prompt_id) + else: + assert context.sampling_start_step is not None + scheduler.replay_buffer.commit( + prompt_id, + [ + ExperienceItem( + prompt_id=prompt_id, + domain=context.domain, + sampling_start_step=context.sampling_start_step, + data=response, + ) + for response in responses + ], + ) - def get_available_dp_rank(self): - while True: - # 负载均衡逻辑,期望各dp 正在处理的条数基本接近 - sorted_ranks = sorted( - self.load_balance_coordinator.keys(), key=lambda rank: (self.load_balance_coordinator[rank], rank) - ) - if self.load_balance_coordinator[sorted_ranks[0]] < self.max_running_requests: - yield sorted_ranks[0] + def __init__( + self, + scheduler: DynamicSamplingScheduler, + prompt_id: int, + meta_info, + ): + # store reference to scheduler as a 'proxy' + self._scheduler: DynamicSamplingScheduler = scheduler + + # export system/prompt level meta info and config to user + self.prompt_id = prompt_id + self.meta_info = copy.deepcopy(meta_info) # user may change config in meta_info + self.pipeline_config = scheduler.pipeline_config + self.is_val = scheduler.is_val + self.sequence_length = scheduler.sequence_length + self.prompt_length = scheduler.pipeline_config.prompt_length + self.is_num_return_sequences_expand = scheduler.pipeline_config.is_num_return_sequences_expand + + # User can call reward worker of different domain in for a single data, but ExperienceItem.domain is bind to dataset + self.domain = None + + # The following attributes are setted after generate and reward begin. + self.sampling_start_step = None + self._lease: LoadBalancer.Lease = None + self._in_do_generate_and_reward = False + + def get_request_data(self, meta_info): + assert not self._in_do_generate_and_reward and self.sampling_start_step is None + + # TODO allow user to get multiple data inside a RolloutContext? + if getattr(self, "got_data", False): + raise RuntimeError("Should call get_request_data only once." + " If want ot filter dataset item out, return None" + " and let framework to schedule another prompt") + else: + self.got_data: bool = True + + dataset_item = self._scheduler.get_next_dataset_item() + domain = dataset_item.get("domain", "default") + collect_data = self._scheduler.collect_fn([dataset_item]) + self.domain = domain + return DataProto.from_single_dict(collect_data, meta_info=meta_info), domain + + @asynccontextmanager + async def do_generate_and_reward(self, max_concurrency): + assert not self._in_do_generate_and_reward and self.sampling_start_step is None + self._in_do_generate_and_reward = True + + # All reuqest of the same prompt are scheduled to the same worker. + # sample_params.n will take n credits rather than 1. + # LoadBalancer.acquire will block until can send new request to actor infer. + # Current implementation rely on the assumption that returned dp_rank is stable. + self._lease = await self._scheduler.load_balancer.acquire(credit=max_concurrency) + + # Assume sampling_start_step of all samples of this prompt are the same, however + # the real sampling_start_step can be different from self.sampling_start_step. 
+ try: + sampling_start_step = await self._scheduler.replay_buffer.begin(prompt_id=self.prompt_id) + except: + self._lease.clear() + raise + self.sampling_start_step = sampling_start_step + try: + yield + except: + self._lease.clear() + raise + finally: + assert ( + self.prompt_id not in self._scheduler.running_requests[self._lease._dp_rank] or + len(self._scheduler.running_requests[self._lease._dp_rank][self.prompt_id]) == 0 + ), f"User should gather all running requests: {self._scheduler.running_requests[self._lease._dp_rank][self.prompt_id]=}" + self._scheduler.running_requests[self._lease._dp_rank].pop(self.prompt_id, None) + self._in_do_generate_and_reward = False -@ray.remote -class GlobalCounter: - def __init__(self): - self.value = -1 + async def generate( + self, + req: DataProto, + domain: str, + ) -> DataProto: + assert self._in_do_generate_and_reward + async with self._lease.lock(samplen=req.meta_info["generation_config"]["num_return_sequences"]) as dp_rank: + with self._scheduler.generate_timer[domain].track(): + request_id = self._scheduler.next_request_id() + req.meta_info["request_id"] = request_id + logger.debug(f"generate_and_reward: {self.prompt_id=} {request_id} generate_request") + self._scheduler.running_requests[dp_rank][self.prompt_id].add(request_id) + try: + # InferWorker.generate_request only returns data with finish_reason=="abort" on abort + # and does not raise asyncio.CancelledError, so this try/finally block may not be necessary. + data = await self._scheduler.actor_cluster.workers[dp_rank].generate_request.remote(req) + # TODO ray.cancel(ref) on asyncio.CancelledError + finally: + self._scheduler.running_requests[dp_rank][self.prompt_id].remove(request_id) + assert data is not None + return data + + async def compute_rewards( + self, + req: DataProto, + domain: str, + ) -> DataProto: + # The reward worker must be able to compute rewards for a single sample; with dynamic sampling, take care with rewards that have to be computed in batches... + # With multiple domains, an LLM-as-judge reward worker needs GPUs allocated to it separately. + assert self._in_do_generate_and_reward + with self._scheduler.reward_timer[domain].track(): + reward_worker = next(self._scheduler.reward_worker_iters[domain]) + logger.debug(f"generate_and_reward: {self.prompt_id=} compute_rewards") + output_count = req.batch.batch_size[0] + req.non_tensor_batch["rollout_id"] = np.array([str(uuid.uuid4()) for _ in range(output_count)], dtype=object) + return await reward_worker.compute_rewards.remote(req) + + async def abort_running_requests(self): + """ + Abort all running requests. - def get_value(self): - self.value += 1 - return self.value + + Notice: Both abort and partial rollout return an unfinished response; users should distinguish + these two cases themselves to avoid a dead loop (if the abort state is not checked in a multi-turn rollout, + another generate request may be sent).
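+
+        Example (illustrative sketch only; `context` and `req` follow the naming used in
+        UserDefinedRolloutLoop, and the surrounding loop is up to the user):
+            data = await context.generate(req=req, domain=domain)
+            if not is_report_data_finished(data):
+                # Either this prompt was aborted (e.g. via abort_running_requests) or the
+                # scheduler paused it for a partial rollout; decide whether to resend or stop here.
+                ...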
+ """ + assert self._in_do_generate_and_reward + assert self.prompt_id is not None + dp_rank = self._lease._dp_rank + request_ids = list(self._scheduler.running_requests[dp_rank][self.prompt_id]) + await self._scheduler.actor_cluster.workers[dp_rank].abort_requests.remote(request_ids) @ray.remote class RequestScheduler: - def __init__(self, infer_cluster, pipeline_config): + def __init__(self, infer_cluster, pipeline_config, resource_manager): self.infer_cluster = infer_cluster self.pipeline_config = pipeline_config + self.resource_manager = resource_manager self.request_id = uuid.uuid4() self.request_counter = 0 self.src_rank2_dp_rank = {} self.request_id_2_dp_rank = {} - self.inflight_requests: List[Dict[str, asyncio.Future]] = [{} for _ in range(self.infer_cluster.world_size)] + self.request_id_2_src_rank: Dict[str, int] = {} # Reverse lookup for abort + self.running_requests: List[set[str]] = [set() for _ in range(self.infer_cluster.world_size)] self.worker_iter = itertools.cycle(range(self.infer_cluster.world_size)) self.need_suspend = False self.suspend_notifier = asyncio.Event() + self.empty_notifier = asyncio.Event() + + # Active DP ranks for request routing + self.active_dp_ranks: Set[int] = set(range(self.infer_cluster.world_size)) # All ranks initially active + self.routing_lock = asyncio.Lock() # Protect routing updates async def generate_one_request(self, data: DataProto): await self._check_suspend() src_rank = data.meta_info["src_rank"] - if src_rank not in self.src_rank2_dp_rank: - dp_rank = next(self.worker_iter) - self.src_rank2_dp_rank[src_rank] = dp_rank + # Atomic routing assignment under lock to prevent TOCTOU race with shrink/expand + async with self.routing_lock: + # Least-loaded dispatch + if src_rank not in self.src_rank2_dp_rank: + dp_rank = self._get_least_active_dp_rank() + self.src_rank2_dp_rank[src_rank] = dp_rank + dp_rank = self.src_rank2_dp_rank[src_rank] request_id = f"{self.request_id}_{self.request_counter}" self.request_counter += 1 data.meta_info["request_id"] = request_id - fut = asyncio.Future() + self.request_id_2_dp_rank[request_id] = dp_rank - self.inflight_requests[dp_rank][request_id] = fut - ref = self.infer_cluster.workers[dp_rank].add_request.remote(command=GenerateRequestType.ADD, data=data) - await asyncio.wrap_future(ref.future()) - response_data = await fut - if response_data is None: - # request aborted + self.request_id_2_src_rank[request_id] = src_rank + self.running_requests[dp_rank].add(request_id) + + try: + response_data = await self.infer_cluster.workers[dp_rank].generate_request.remote(data=data) + finally: + self.running_requests[dp_rank].remove(request_id) + self.empty_notifier.set() + # Cleanup tracking (on both success and abort paths) + self.request_id_2_src_rank.pop(request_id, None) + + assert response_data is not None + + if not is_report_data_finished(response_data): return None # postprocess_generate, input_ids, attention_mask, left pad @@ -925,46 +1364,546 @@ async def generate_one_request(self, data: DataProto): output.meta_info = request_repeat.meta_info return output - async def report_response(self, data: DataProto, is_abort=False): - request_id = data.meta_info["request_id"] - if request_id not in self.request_id_2_dp_rank: - return - dp_rank = self.request_id_2_dp_rank.pop(request_id) - fut = self.inflight_requests[dp_rank].pop(request_id) - if is_abort: - fut.set_result(None) - else: - fut.set_result(data) - async def abort_request(self): - futures = [] - for i in range(self.infer_cluster.world_size): - if 
len(self.inflight_requests[i]) == 0: - continue - ref = self.infer_cluster.workers[i].add_request.remote( - command=GenerateRequestType.ABORT, data=DataProto( - meta_info={"request_id": [request_id for request_id in self.inflight_requests[i].keys()]} - ) - ) - futures.append(ref) - for request_id in self.inflight_requests[i].keys(): - futures.append(self.report_response(data=DataProto(meta_info={"request_id": request_id}), is_abort=True)) - # must await at last, because report_response will mut inflight_requests - await asyncio.gather(*futures) + await asyncio.gather(*( + self.infer_cluster.workers[dp_rank].abort_requests.remote(list(self.running_requests[dp_rank])) + for dp_rank in range(self.infer_cluster.world_size) + if self.running_requests[dp_rank] + )) + async def _check_suspend(self): while self.need_suspend: await self.suspend_notifier.wait() + def empty(self): + return sum([len(running_requests) for running_requests in self.running_requests]) == 0 + async def suspend(self): if self.need_suspend: return self.suspend_notifier.clear() self.need_suspend = True await self.abort_request() + while not self.empty(): + self.empty_notifier.clear() + await self.empty_notifier.wait() def resume(self): if not self.need_suspend: return self.need_suspend = False self.suspend_notifier.set() + + def _get_gpus_for_dp_rank(self, dp_rank: int) -> List[int]: + """Map DP rank to GPU IDs using cluster's device info. + + Args: + dp_rank: Data parallel rank index (0 to dp_size-1) + + Returns: + List of GPU IDs used by this DP rank's workers + + Example: + # Pure DP: rank == dp_rank + # DP rank 0 uses GPUs [0], DP rank 1 uses GPUs [1], etc. + gpus = self._get_gpus_for_dp_rank(dp_rank=0) + # Returns: [0] + """ + # In agentic pipeline (pure DP): rank == dp_rank, so directly access rank2devices + devices_info = self.infer_cluster.rank2devices[dp_rank] + + # Extract GPU IDs: gpu_id = node_rank * num_gpus_per_node + gpu_rank + gpu_ids = [] + for device in devices_info: + num_gpus_per_node = self.resource_manager.gpu_per_node + gpu_id = device["node_rank"] * num_gpus_per_node + device["gpu_rank"] + gpu_ids.append(gpu_id) + + return sorted(set(gpu_ids)) # Remove duplicates and sort + + def _get_least_active_dp_rank(self) -> int: + """Find DP rank with fewest assigned src_ranks (environments). + + Returns: + DP rank with minimum src_rank count from src_rank2_dp_rank + + Raises: + RuntimeError: If no active ranks + + Note: + Counts unique src_ranks (environments) per worker, not in-flight requests. + With sticky mapping, one src_rank generates multiple sequential requests. + """ + candidate_ranks = list(self.active_dp_ranks) + if not candidate_ranks: + raise RuntimeError("No active DP ranks") + # todo optimization: (yangpeng) not efficient, better to use counter for this + # Count src_ranks per dp_rank + src_rank_count = defaultdict(int) + for src_rank, dp_rank in self.src_rank2_dp_rank.items(): + if dp_rank in self.active_dp_ranks: + src_rank_count[dp_rank] += 1 + + # Return dp_rank with minimum src_rank count + return min(candidate_ranks, key=lambda r: src_rank_count[r]) + + def _clear_src_rank_mappings(self, src_ranks: Set[int]) -> None: + """Clear sticky mappings to allow re-routing on retry.""" + for src_rank in src_ranks: + self.src_rank2_dp_rank.pop(src_rank, None) + + async def rebalance_on_shrink(self, shrink_dp_ranks: List[int]) -> Dict[str, int]: + """Abort requests on shrinking workers, clear mappings for natural re-dispatch. 
+ + Args: + shrink_dp_ranks: DP ranks to remove from active set + + Returns: + {"aborted": count, "remapped": count} + + Raises: + ValueError: If shrink_dp_ranks empty/invalid/duplicates + RuntimeError: If timeout or operation fails + """ + # VAL: VAL_NON_EMPTY, VAL_TYPE_CHECK, VAL_INT_RANGE, VAL_NO_DUPLICATES + if not shrink_dp_ranks: + raise ValueError("shrink_dp_ranks cannot be empty") + + for rank in shrink_dp_ranks: + if not isinstance(rank, int): + raise TypeError(f"Expected int, got {type(rank)}") + if not (0 <= rank < self.infer_cluster.world_size): + raise ValueError(f"rank {rank} out of range") + + if len(shrink_dp_ranks) != len(set(shrink_dp_ranks)): + raise ValueError(f"Duplicates in shrink_dp_ranks") + + # P0: LOCK_TIMEOUT + try: + return await asyncio.wait_for( + self._rebalance_on_shrink(shrink_dp_ranks), + timeout=30.0 + ) + except asyncio.TimeoutError: + raise RuntimeError("rebalance_on_shrink timed out after 30s") + + async def _rebalance_on_shrink(self, shrink_dp_ranks: List[int]) -> Dict[str, int]: + """Internal implementation of shrink rebalancing. + + PRE-CONDITION: routing_lock MUST be held by caller (shrink_workers). + This method does NOT acquire the lock internally to avoid double-lock deadlock. + + Args: + shrink_dp_ranks: DP ranks to remove from active set + + Returns: + {"aborted": count, "remapped": count} + + Raises: + RuntimeError: If shrink operation fails + """ + keep_ranks = list(self.active_dp_ranks - set(shrink_dp_ranks)) + if not keep_ranks: + raise ValueError("Cannot shrink to zero active ranks") + + old_active_ranks = self.active_dp_ranks.copy() + self.active_dp_ranks = set(keep_ranks) + + try: + total_aborted = 0 + abort_futures = [] + + for dp_rank in shrink_dp_ranks: + request_ids = list(self.running_requests[dp_rank]) + if not request_ids: + continue + + total_aborted += len(request_ids) + + abort_futures.append( + self.infer_cluster.workers[dp_rank].abort_requests.remote(request_ids) + ) + + + + await asyncio.gather(*abort_futures) + + while True: + remain = sum(len(self.running_requests[dp_rank]) for dp_rank in shrink_dp_ranks) + if remain == 0: + break + logger.info(f"Shrink: waiting for {len(shrink_dp_ranks)} workers {remain=} to finish abort") + await asyncio.sleep(3) + + # Clear ALL mappings pointing to shrinking workers (not just in-flight) + shrink_dp_ranks_set = set(shrink_dp_ranks) + src_ranks_to_remap = set([ + src_rank for src_rank, dp_rank in self.src_rank2_dp_rank.items() + if dp_rank in shrink_dp_ranks_set + ]) + self._clear_src_rank_mappings(src_ranks_to_remap) + + logger.info( + f"Shrink: aborted {total_aborted} requests, " + f"cleared {len(src_ranks_to_remap)} mappings" + ) + + return {"aborted": total_aborted, "remapped": len(src_ranks_to_remap)} + + except Exception as e: + self.active_dp_ranks = old_active_ranks + raise RuntimeError(f"Shrink failed: {e}") from e + + async def rebalance_on_expand(self, expand_dp_ranks: List[int]) -> Dict[str, int]: + """Add workers and rebalance via src_rank-level abort. 
+ + Args: + expand_dp_ranks: DP ranks to add to active set + + Returns: + {"aborted": count, "remapped": count} + + Raises: + ValueError: If expand_dp_ranks is invalid + RuntimeError: If timeout or operation fails + """ + # VAL: VAL_NON_EMPTY, VAL_TYPE_CHECK, VAL_INT_RANGE, VAL_NO_DUPLICATES + if not expand_dp_ranks: + raise ValueError("expand_dp_ranks cannot be empty") + for rank in expand_dp_ranks: + if not isinstance(rank, int): + raise TypeError(f"Expected int, got {type(rank)}") + if not (0 <= rank < self.infer_cluster.world_size): + raise ValueError(f"rank {rank} out of range") + if len(expand_dp_ranks) != len(set(expand_dp_ranks)): + raise ValueError(f"Duplicates in expand_dp_ranks") + + # P0: LOCK_TIMEOUT + try: + return await asyncio.wait_for( + self._rebalance_on_expand(expand_dp_ranks), + timeout=30.0 + ) + except asyncio.TimeoutError: + raise RuntimeError("rebalance_on_expand timed out after 30s") + + async def _rebalance_on_expand(self, expand_dp_ranks: List[int]) -> Dict[str, int]: + """Internal implementation of expand rebalancing. + + PRE-CONDITION: routing_lock MUST be held by caller (expand_workers). + This method does NOT acquire the lock internally to avoid double-lock deadlock. + + Algorithm: Round-robin selection across old workers + 1. Calculate proportional src_ranks to keep: src_ranks_to_keep = ceil(total * old_count / new_count) + 2. Group existing src_ranks by dp_rank (only old workers) + 3. Round-robin iterate over old workers using cycle() + 4. Select one src_rank at a time until remaining_to_abort reaches 0 + 5. Abort ALL requests from selected src_ranks + 6. Clear src_rank mappings for reallocation to new workers + + Implementation Notes: + - Uses cycle() for infinite round-robin iteration over old workers + - The `if dp_rank not in old_active_dp_ranks` guard in the loop below is redundant + since dp_rank_to_src_ranks already contains only old workers, but it is kept as a defensive guard + - Loop terminates when remaining_to_abort <= 0 or all worker lists are exhausted + - If all worker lists were exhausted before reaching the target, the loop would cycle indefinitely; + there is no explicit emptiness check, but in the current flow src_ranks_to_abort is always smaller + than the number of grouped src_ranks, so pop(0) keeps the loop progressing + + Args: + expand_dp_ranks: DP ranks to add to active set (already validated) + + Returns: + {"aborted": count, "remapped": count} - count of src_ranks aborted/remapped + + Preconditions: + - routing_lock MUST be held by caller + - expand_dp_ranks validated (non-empty, int, in range, no duplicates) + + Postconditions: + - active_dp_ranks updated with expand_dp_ranks + - Selected src_ranks aborted and removed from mappings + - Running requests from aborted src_ranks are aborted via abort_requests + """ + # Calculate counts before updating active_dp_ranks + old_dp_count = len(self.active_dp_ranks) + old_active_dp_ranks = self.active_dp_ranks.copy() + + self.active_dp_ranks.update(expand_dp_ranks) + new_dp_count = len(self.active_dp_ranks) + + total_src_ranks = len(self.src_rank2_dp_rank) + if total_src_ranks == 0: + return {"aborted": 0, "remapped": 0} + + # Proportional calculation + src_ranks_to_keep = math.ceil(total_src_ranks * old_dp_count / new_dp_count) + src_ranks_to_abort = total_src_ranks - src_ranks_to_keep + + if src_ranks_to_abort <= 0: + logger.info("Expand: no rebalancing needed (src_ranks_to_abort <= 0)") + return {"aborted": 0, "remapped": 0} + + # Group src_ranks by dp_rank (old workers only) + dp_rank_to_src_ranks = defaultdict(list) + for src_rank, dp_rank in self.src_rank2_dp_rank.items(): + if dp_rank in old_active_dp_ranks: +
dp_rank_to_src_ranks[dp_rank].append(src_rank) + + # Round-robin selection: iterate over old workers and select one src_rank at a time + # todo optimization:(yangpeng) take uneven dp load into consideration and do dynamic load balancing, not just RR + selected_src_ranks = [] + remaining_to_abort = src_ranks_to_abort + for dp_rank in cycle(dp_rank_to_src_ranks.keys()): + if not dp_rank in old_active_dp_ranks: + continue + + if remaining_to_abort <= 0: + break + + src_ranks_on_worker = dp_rank_to_src_ranks.get(dp_rank, []) + if not src_ranks_on_worker: + continue + selected_src_ranks.append(src_ranks_on_worker.pop(0)) + + remaining_to_abort -= 1 + + # Remove from mapping and group by dp_rank for abort + abort_by_dp_rank = defaultdict(list) + for src_rank in selected_src_ranks: + dp_rank = self.src_rank2_dp_rank.pop(src_rank) + + # Find request_id(s) for this src_rank + for request_id, sr in self.request_id_2_src_rank.items(): + if sr == src_rank: + abort_by_dp_rank[dp_rank].append(request_id) + + # Send batched ABORT commands + abort_futures = [] + total_aborted = 0 + for dp_rank, request_ids in abort_by_dp_rank.items(): + if not request_ids: + continue + + total_aborted += len(request_ids) + abort_futures.append( + self.infer_cluster.workers[dp_rank].abort_requests.remote(request_ids) + ) + + + await asyncio.gather(*abort_futures) + + logger.info( + f"Expand: aborted {len(selected_src_ranks)} src_ranks, " + f"cleared {len(selected_src_ranks)} mappings " + f"(proportional: {old_dp_count}/{new_dp_count})" + ) + + return {"aborted": len(selected_src_ranks), "remapped": len(selected_src_ranks)} + + def _validate_target_gpus(self, target_gpus: List[int], mode: str) -> None: + """Validate target_gpus input for shrink/expand operations. + + Args: + target_gpus: List of GPU IDs to free (shrink) or restore (expand) + mode: Operation mode ("shrink" or "expand") + + Raises: + ValueError: If target_gpus is empty, has duplicates, or mode is invalid + + Example: + self._validate_target_gpus([4, 5, 6, 7], mode="shrink") + # Validates successfully + + self._validate_target_gpus([], mode="shrink") + # Raises: ValueError("[shrink] target_gpus cannot be empty") + + self._validate_target_gpus([4, 4, 5], mode="expand") + # Raises: ValueError("[expand] target_gpus has duplicates: [4, 4, 5]") + """ + # VAL: VAL_NON_EMPTY + if not target_gpus: + raise ValueError(f"[{mode}] target_gpus cannot be empty") + + # VAL: VAL_NO_DUPLICATES + if len(target_gpus) != len(set(target_gpus)): + raise ValueError(f"[{mode}] target_gpus has duplicates: {target_gpus}") + + if mode not in ("shrink", "expand"): + raise ValueError(f"Invalid mode: {mode}") + + def _validate_calculated_ranks(self, ranks: List[int], mode: str) -> None: + """Validate calculated DP ranks against current active_dp_ranks state. 
+ + Args: + ranks: List of DP ranks calculated from target_gpus + mode: Operation mode ("shrink" or "expand") + + Raises: + ValueError: If ranks is empty, contains out-of-range values, + or violates state consistency (shrink: must be active, + expand: must be inactive) + + Example: + # Shrink validation + self.active_dp_ranks = {0, 1, 2, 3} + self._validate_calculated_ranks([2, 3], mode="shrink") + # Validates successfully (ranks 2, 3 are active) + + self._validate_calculated_ranks([4], mode="shrink") + # Raises: ValueError("[shrink] DP rank 4 not active") + + # Expand validation + self.active_dp_ranks = {0, 1} + self._validate_calculated_ranks([2, 3], mode="expand") + # Validates successfully (ranks 2, 3 are inactive) + + self._validate_calculated_ranks([0], mode="expand") + # Raises: ValueError("[expand] DP rank 0 already active") + """ + # VAL: VAL_NON_EMPTY + if not ranks: + raise ValueError(f"[{mode}] Calculated ranks list is empty") + + # VAL: VAL_INT_RANGE + for dp_rank in ranks: + if not (0 <= dp_rank < self.infer_cluster.world_size): + raise ValueError(f"[{mode}] DP rank {dp_rank} out of range [0, {self.infer_cluster.world_size})") + + # AST: State consistency + + for dp_rank in ranks: + if dp_rank not in self.active_dp_ranks: + raise ValueError(f"DP rank {dp_rank} not active {mode=}") + + async def shrink_workers(self, target_gpus: List[int]) -> Dict[str, Any]: + """Complete atomic shrink operation: validate → rebalance → offload → update routing. + + Orchestrates the full worker shrink process: + 1. Validates target_gpus input + 2. Calculates DP ranks to offload based on GPU overlap + 3. Validates calculated ranks against active state + 4. Atomically (under routing_lock): + - Rebalances routing (aborts requests on shrinking workers) + - Offloads model states from shrinking workers + 5. 
Returns metrics for monitoring + + Args: + target_gpus: GPU IDs to free (e.g., [4, 5, 6, 7] to free second half of 8 GPUs) + + Returns: + Metrics dict containing: + - "aborted": Number of requests aborted during rebalancing + - "remapped": Number of src_ranks remapped (cleared from routing) + - "shrink_duration_ms": Total operation time in milliseconds + - "offload_ranks": List of DP ranks that were offloaded + + Raises: + ValueError: If target_gpus invalid (empty, duplicates) or + calculated ranks invalid (not active, out of range) + RuntimeError: If rebalance or offload operations fail + + Example: + # Shrink to free GPUs [4, 5, 6, 7] (second half of 8-GPU setup) + result = await scheduler.shrink_workers([4, 5, 6, 7]) + # Returns: {"aborted": 10, "remapped": 5, "shrink_duration_ms": 2340.5, "offload_ranks": [2, 3]} + + Side Effects: + - Updates active_dp_ranks (removes offload_ranks) + - Aborts in-flight requests on shrinking workers + - Clears src_rank mappings for remapped environments + - Offloads model states from shrinking workers to CPU + """ + start_time = time.time() + + # VAL: VAL_NON_EMPTY, VAL_NO_DUPLICATES + self._validate_target_gpus(target_gpus, mode="shrink") + # Calculate DP ranks to offload + target_gpus = set(target_gpus) + offload_ranks = [dp for dp in range(self.infer_cluster.world_size) + if set(self._get_gpus_for_dp_rank(dp)).intersection(target_gpus)] + + # VAL: VAL_NON_EMPTY, state consistency check + self._validate_calculated_ranks(offload_ranks, mode="shrink") + + # Atomic operation under routing_lock + async with self.routing_lock: + # Rebalance (abort + update active_dp_ranks) + result = await self.rebalance_on_shrink(offload_ranks) + # release the lock before blocking offload so that active dp rank can work immediately + # Offload states from target workers + offload_refs = self.infer_cluster.offload_states_partial(offload_ranks, blocking=False) + await asyncio.gather(*[asyncio.wrap_future(ref.future()) for ref in offload_refs]) + + return {**result, "shrink_duration_ms": (time.time() - start_time) * 1000, + "offload_ranks": offload_ranks} + + async def expand_workers(self, target_gpus: List[int], skip_load: bool = False) -> Dict[str, Any]: + """Complete atomic expand operation: validate → load → rebalance → update routing. + + Orchestrates the full worker expand process: + 1. Validates target_gpus input + 2. Calculates DP ranks to restore based on GPU overlap + 3. Validates calculated ranks against active state (skip if skip_load=True) + 4. Atomically (under routing_lock): + - Loads model states on expanding workers (skip if skip_load=True) + - Rebalances routing (proportionally redistributes requests) + 5. Returns metrics for monitoring + + Args: + target_gpus: GPU IDs to restore (e.g., [4, 5, 6, 7] to restore second half of 8 GPUs) + skip_load: If True, skip model loading and validation (use when model_update already loaded states). + This only updates active_dp_ranks to restore routing state without re-loading models. 
+ + Returns: + Metrics dict containing: + - "aborted": Number of requests aborted during rebalancing (proportional redistribution) + - "remapped": Number of src_ranks remapped (cleared from routing) + - "expand_duration_ms": Total operation time in milliseconds + - "load_ranks": List of DP ranks that were restored + + Raises: + ValueError: If target_gpus invalid (empty, duplicates) or + calculated ranks invalid (already active, out of range) + RuntimeError: If load or rebalance operations fail + + Example: + # Expand to restore GPUs [4, 5, 6, 7] (second half of 8-GPU setup) + result = await scheduler.expand_workers([4, 5, 6, 7]) + # Returns: {"aborted": 3, "remapped": 3, "expand_duration_ms": 1850.2, "load_ranks": [2, 3]} + + # After model_update already loaded states to all GPUs, just restore routing: + result = await scheduler.expand_workers([4, 5, 6, 7], skip_load=True) + + Side Effects: + - Updates active_dp_ranks (adds load_ranks) + - Loads model states from CPU to expanding workers (unless skip_load=True) + - Aborts some requests from old workers for proportional rebalancing + - Clears src_rank mappings for rebalanced environments (will route to new workers) + """ + start_time = time.time() + + # VAL: VAL_NON_EMPTY, VAL_NO_DUPLICATES + self._validate_target_gpus(target_gpus, mode="expand") + + # Calculate DP ranks to restore + target_gpus = set(target_gpus) + load_ranks = [dp for dp in range(self.infer_cluster.world_size) + if set(self._get_gpus_for_dp_rank(dp)).issubset(target_gpus)] + + # VAL: VAL_NON_EMPTY, state consistency check + # Skip validation when skip_load=True because ranks may already be "active" in cluster + # (model states loaded by model_update) but not tracked in active_dp_ranks yet + if not skip_load: + self._validate_calculated_ranks(load_ranks, mode="expand") + load_refs = self.infer_cluster.load_states_partial(load_ranks, blocking=False) + await asyncio.gather(*[asyncio.wrap_future(ref.future()) for ref in load_refs]) + + # Atomic operation under routing_lock + async with self.routing_lock: + + # Rebalance (update active_dp_ranks + conditional abort) + result = await self.rebalance_on_expand(load_ranks) + + return {**result, "expand_duration_ms": (time.time() - start_time) * 1000, + "load_ranks": load_ranks} diff --git a/roll/distributed/scheduler/reward_scheduler.py b/roll/distributed/scheduler/reward_scheduler.py index 5619539e4..a58132c0f 100644 --- a/roll/distributed/scheduler/reward_scheduler.py +++ b/roll/distributed/scheduler/reward_scheduler.py @@ -1,3 +1,4 @@ +import asyncio from collections import defaultdict from typing import Dict, Optional, List, Any @@ -29,7 +30,7 @@ def __init__(self): self.pipeline_config = None self.progress_bar: Optional[tqdm] = None - def compute_rewards(self, data: DataProto, reward_clusters: Dict[str, Any], pipeline_config) -> DataProto: + async def compute_rewards(self, data: DataProto, reward_clusters: Dict[str, Any], pipeline_config) -> DataProto: """ 保序返回rewards """ @@ -54,7 +55,8 @@ def compute_rewards(self, data: DataProto, reward_clusters: Dict[str, Any], pipe # reward worker compute_rewards 接口返回结果保序 if domain not in grouped_data.keys(): continue - domain_rewards: DataProto = DataProto.materialize_concat(data_refs=domain_rewards_ref) + data = await asyncio.gather(*[ref.obj_ref for ref in domain_rewards_ref]) + domain_rewards: DataProto = DataProto.concat(data) domain_rewards.batch["prompt_id"] = grouped_data[domain].batch["prompt_id"] rewards_list.append(domain_rewards) diff --git 
a/roll/distributed/scheduler/rollout_mock_mixin.py b/roll/distributed/scheduler/rollout_mock_mixin.py new file mode 100644 index 000000000..ece814cf5 --- /dev/null +++ b/roll/distributed/scheduler/rollout_mock_mixin.py @@ -0,0 +1,200 @@ +""" +Rollout Mock Mixin for dump/mock mechanism. + +This mixin provides dump/mock functionality for schedulers to enable +deterministic testing by saving/loading DataProto objects. +""" +import os +import pickle +from typing import Optional + +from roll.distributed.scheduler.protocol import DataProto +from roll.utils.logging import get_logger + +logger = get_logger() + + +class RolloutMockMixin: + """ + Mixin class providing rollout dump/mock functionality. + + This mixin should be used with scheduler classes that have: + - self.config or self.pipeline_config: Configuration object with optional rollout_mock attribute + - self.mode (str) OR self.is_val (bool): Indicating 'train' or 'val' mode + + Usage: + # For schedulers with self.mode (like RolloutScheduler): + class MyScheduler(RolloutMockMixin, BaseScheduler): + def __init__(self, config, mode, ...): + self.config = config + self.mode = mode + self._init_rollout_mock() + ... + + # For schedulers with self.is_val (like DynamicSamplingScheduler): + class MyScheduler(RolloutMockMixin, BaseScheduler): + def __init__(self, pipeline_config, ...): + self.pipeline_config = pipeline_config + self.is_val = is_val + self._init_rollout_mock() + ... + + async def get_batch(self, ...): + # In mock mode, load pre-recorded data + if self._should_load_mock(global_step): + return await self._load_mock_batch(global_step) + + # Normal flow... + batch = await self._actual_get_batch(...) + + # In dump mode, save the batch + await self._maybe_dump_batch(batch, global_step) + return batch + """ + + def _get_config(self): + """Get configuration object (supports both self.config and self.pipeline_config).""" + return getattr(self, 'config', None) or getattr(self, 'pipeline_config', None) + + def _get_mode_str(self) -> str: + """ + Get mode string ('train' or 'val'). + + Supports both self.mode (str) and self.is_val (bool) attributes. + """ + if hasattr(self, 'mode'): + return self.mode + elif hasattr(self, 'is_val'): + return 'val' if self.is_val else 'train' + else: + raise AttributeError("Scheduler must have either 'mode' or 'is_val' attribute") + + def _init_rollout_mock(self): + """ + Initialize rollout mock configuration. + + Should be called in the scheduler's __init__ method after + config and mode/is_val attributes are set. + """ + config = self._get_config() + if config is None: + logger.warning("[RolloutMock] No config found, mock functionality disabled") + self.mock_config = None + return + + self.mock_config = getattr(config, 'rollout_mock', None) + if self.mock_config and self.mock_config.enable: + mode_str = self._get_mode_str() + dump_dir = os.path.join(self.mock_config.dump_dir, mode_str) + os.makedirs(dump_dir, exist_ok=True) + logger.info( + f"[RolloutMock] Rollout Mock enabled: mode={self.mock_config.mode}, " + f"dir={self.mock_config.dump_dir}, scheduler_mode={mode_str}, format=pickle" + ) + + def _should_load_mock(self, global_step: int) -> bool: + """ + Check if we should load mock data for this step. + + Args: + global_step: Current training step + + Returns: + True if mock mode is enabled and we should load data + """ + return ( + self.mock_config + and self.mock_config.enable + and self.mock_config.mode == "mock" + ) + + def _should_dump_batch(self) -> bool: + """ + Check if we should dump batches. 
+ + Returns: + True if dump mode is enabled + """ + return ( + self.mock_config + and self.mock_config.enable + and self.mock_config.mode == "dump" + ) + + async def _maybe_dump_batch(self, batch: DataProto, global_step: int): + """ + Dump batch if dump mode is enabled. + + Args: + batch: DataProto to dump + global_step: Current training step + """ + if self._should_dump_batch(): + await self._dump_batch(batch, global_step) + + async def _dump_batch(self, batch: DataProto, global_step: int): + """ + Dump DataProto to disk (pickle format). + + Args: + batch: DataProto to dump + global_step: Current training step + """ + mode_str = self._get_mode_str() + dump_path = os.path.join( + self.mock_config.dump_dir, + mode_str, + f"step_{global_step:06d}.pkl" + ) + os.makedirs(os.path.dirname(dump_path), exist_ok=True) + + # Use pickle serialization (DataProto supports __getstate__/__setstate__) + with open(dump_path, 'wb') as f: + pickle.dump(batch, f, protocol=pickle.HIGHEST_PROTOCOL) + + file_size_mb = os.path.getsize(dump_path) / (1024 * 1024) + logger.info( + f"[RolloutMock] Dumped step {global_step}: {dump_path} " + f"(samples={len(batch)}, size={file_size_mb:.2f}MB)" + ) + + async def _load_mock_batch(self, global_step: int) -> DataProto: + """ + Load pre-recorded DataProto from disk (strict mode). + + Args: + global_step: Current training step + + Returns: + Loaded DataProto + + Raises: + FileNotFoundError: If mock file doesn't exist + """ + mode_str = self._get_mode_str() + mock_path = os.path.join( + self.mock_config.dump_dir, + mode_str, + f"step_{global_step:06d}.pkl" + ) + + # Strict mode: raise error if file doesn't exist + if not os.path.exists(mock_path): + raise FileNotFoundError( + f"[RolloutMock] Mock file not found: {mock_path}\n" + f"Possible reasons:\n" + f" 1. Step {global_step} was never run in dump mode\n" + f" 2. Incorrect dump_dir configuration: {self.mock_config.dump_dir}\n" + f" 3. Mode mismatch (current mode: {mode_str})\n" + f"Please run in dump mode first to ensure all step data is generated." 
+ ) + + # Deserialize + with open(mock_path, 'rb') as f: + batch = pickle.load(f) + + logger.info( + f"[RolloutMock] Loaded step {global_step}: {mock_path} " + f"(samples={len(batch)})" + ) + return batch diff --git a/roll/distributed/scheduler/rollout_scheduler.py b/roll/distributed/scheduler/rollout_scheduler.py index 8f301730c..6ce801c31 100644 --- a/roll/distributed/scheduler/rollout_scheduler.py +++ b/roll/distributed/scheduler/rollout_scheduler.py @@ -2,7 +2,7 @@ import random import time from dataclasses import dataclass, field -from typing import Any, Dict, List, Optional, Tuple +from typing import Any, Dict, List, Optional, Set, Tuple import ray from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy @@ -12,13 +12,194 @@ from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.generate_scheduler import RequestScheduler from roll.distributed.scheduler.protocol import DataProto +from roll.pipeline.agentic.agentic_config import EnvManagerConfig, EnvMonitorConfig +from roll.distributed.scheduler.rollout_mock_mixin import RolloutMockMixin from roll.pipeline.agentic.agentic_config import EnvManagerConfig -from roll.utils.functionals import append_to_dict, GenerateRequestType +from roll.utils.functionals import append_to_dict from roll.utils.import_utils import safe_import_class from roll.utils.logging import get_logger logger = get_logger() + +class EnvActivityMonitor: + """Environment activity monitor for tracking and detecting hung envs.""" + + def __init__(self, config: EnvMonitorConfig, group_queue_dict: Dict[int, 'GroupQueue']): + """ + Args: + config: EnvMonitorConfig object + group_queue_dict: Reference to GroupQueue dict for checking episode status + """ + self.group_queue_dict = group_queue_dict + self.enable = config.enable + + # Configuration parameters + self.monitor_interval = config.monitor_interval # seconds + self.hung_timeout = config.hung_timeout # seconds (default: 1 hour) + + # Tracking data structures - Dual-timestamp approach + # Track when env starts processing an episode + # Key: ((group_id, env_id), episode_id) -> Value: timestamp + self.env_episode_start: Dict[Tuple[Tuple[int, int], int], float] = {} + + # Track when env submits episode rollout + # Key: ((group_id, env_id), episode_id) -> Value: timestamp + self.env_episode_submit: Dict[Tuple[Tuple[int, int], int], float] = {} + + # Track each env's current episode (for cleanup) + # Key: (group_id, env_id) -> Value: episode_id + self.env_current_episode: Dict[Tuple[int, int], int] = {} + + # Monitor task + self.monitor_task: Optional[asyncio.Task] = None + + def record_episode_start(self, group_id: int, env_id: int, episode_id: int): + """ + Record when env starts processing a new episode. + Called from GroupQueue.get_episode_id() when an episode is assigned to an env. 
+ + Args: + group_id: Group ID + env_id: Environment ID + episode_id: Episode ID assigned to this env + """ + if not self.enable: + return + + env_key = (group_id, env_id) + episode_key = ((group_id, env_id), episode_id) + + # Automatic cleanup: Remove old episode records for this env + old_episode_id = self.env_current_episode.get(env_key) + if old_episode_id is not None and old_episode_id != episode_id: + old_episode_key = ((group_id, env_id), old_episode_id) + self.env_episode_start.pop(old_episode_key, None) + self.env_episode_submit.pop(old_episode_key, None) + + # Record new episode start time + self.env_episode_start[episode_key] = time.time() + self.env_current_episode[env_key] = episode_id + + def record_activity(self, group_id: int, env_id: int, episode_id: int, rollout: Optional[DataProto]): + """ + Record env activity when submitting a rollout. + Called from GroupQueueManager.put() when env submits rollout. + + Args: + group_id: Group ID + env_id: Environment ID + episode_id: Episode ID + rollout: Rollout data (None means env is exiting) + """ + if not self.enable: + return + + env_key = (group_id, env_id) + episode_key = ((group_id, env_id), episode_id) + + if rollout is None: + # Env calls put(..., None) to signal exit, remove all tracking + self.env_episode_start.pop(episode_key, None) + self.env_episode_submit.pop(episode_key, None) + self.env_current_episode.pop(env_key, None) + return + + # Normal rollout submission, record submit time + self.env_episode_submit[episode_key] = time.time() + + def start_monitoring(self): + """Start background monitoring task.""" + if not self.enable or self.monitor_task is not None: + return + + self.monitor_task = asyncio.create_task(self._monitor_loop()) + + def stop_monitoring(self): + """Stop background monitoring task.""" + if self.monitor_task: + self.monitor_task.cancel() + self.monitor_task = None + + def cleanup_episode(self, group_id: int, episode_id: int): + """ + Clean up monitoring data for completed episode. + Note: With dual-timestamp tracking, cleanup is mostly automatic in record_episode_start(). + This method is kept for compatibility but has minimal work to do. + """ + if not self.enable: + return + + # No cleanup needed - dual-timestamp approach handles cleanup automatically + # when new episodes start via record_episode_start() + pass + + async def _monitor_loop(self): + """Background monitoring task that periodically detects hung envs and logs.""" + while True: + try: + await asyncio.sleep(self.monitor_interval) + self.check_and_log_hung_envs() + except asyncio.CancelledError: + break + except Exception as e: + logger.error(f"[EnvMonitor] Monitor loop error: {e}") + + def check_and_log_hung_envs(self): + """ + Detect and log hung envs using dual-timestamp tracking. 
+ + Detection Logic: + - For each env with a start time recorded: + - Check if current episode has a submit time + - If no submit time and (now - start_time) > hung_timeout: + → Report as hung + - If submit time exists: + → Env has completed, don't report (even if timestamp is old) + """ + now = time.time() + hung_envs_by_group = {} # group_id -> list of hung env info + + # Iterate over all episode start records + for episode_key, start_time in self.env_episode_start.items(): + (group_id, env_id), episode_id = episode_key + + # Check if this episode has been submitted + submit_time = self.env_episode_submit.get(episode_key) + + if submit_time is None: + # Env started but hasn't submitted (still processing) + inactive_time = now - start_time + + if inactive_time > self.hung_timeout: + # Report as hung + if group_id not in hung_envs_by_group: + hung_envs_by_group[group_id] = [] + + hung_envs_by_group[group_id].append({ + "env_id": env_id, + "episode_id": episode_id, + "inactive_seconds": int(inactive_time), + }) + # else: Episode submitted, env is waiting for next episode (normal) + + # Output logs + if hung_envs_by_group: + for group_id, hung_envs in hung_envs_by_group.items(): + hung_env_ids = [e["env_id"] for e in hung_envs] + logger.warning( + f"[EnvMonitor] Group {group_id}: Detected {len(hung_envs)} hung envs: {hung_env_ids}" + ) + for env_info in hung_envs[:5]: # Only log details for first 5 + logger.warning( + f"[EnvMonitor] - env_id={env_info['env_id']}, " + f"episode_id={env_info['episode_id']}, " + f"inactive_for={env_info['inactive_seconds']}s" + ) + if len(hung_envs) > 5: + logger.warning(f"[EnvMonitor] ... and {len(hung_envs) - 5} more") + + @dataclass class GroupData: group_id: int @@ -37,6 +218,7 @@ def __init__( max_traj_per_env, async_generation_ratio, group_filter, + env_monitor: Optional['EnvActivityMonitor'] = None, ): self.group_id = group_id self.progress_bar = progress_bar @@ -47,6 +229,7 @@ def __init__( self.async_generation_ratio = async_generation_ratio self.group_filter = group_filter self.group_filter_count = 0 + self.env_monitor = env_monitor self.current_step = None self.next_episode_id = 0 @@ -95,17 +278,33 @@ def advance_step(self, step): expired_episodes.append(episode_id) for episode_id in expired_episodes: self.groups.pop(episode_id) + if self.env_monitor: + self.env_monitor.cleanup_episode(self.group_id, episode_id) self.current_step = step self._advance_step(step) self.progress.set() - async def get_episode_id(self) -> Optional[int]: + async def get_episode_id(self, env_id: Optional[int] = None) -> Optional[int]: + """ + Get the next episode_id for an env to process. 
+ + Args: + env_id: Environment ID requesting work (None for backward compatibility) + + Returns: + episode_id to process, or None if shutting down + """ while not self.quit: # iterate over groups in order for episode_id, group in self.groups.items(): if group.running_rollouts < self.group_size + self.group_size_redundancy: group.running_rollouts += 1 + + # Record episode start for hang detection + if self.env_monitor and env_id is not None: + self.env_monitor.record_episode_start(self.group_id, env_id, episode_id) + return episode_id if self.max_traj_per_env is None: while self.current_step is None: @@ -132,6 +331,8 @@ def put(self, episode_id, start_step, rollout): logger.info(f"filter rollout group {group.group_id} episode {group.episode_id}") self.group_filter_count += 1 self.groups.pop(episode_id) + if self.env_monitor: + self.env_monitor.cleanup_episode(self.group_id, episode_id) self.advance_group(create_step=self.current_step) else: self.complete.set() @@ -146,6 +347,8 @@ async def get(self) -> GroupData: group = self.groups[episode_id] if len(group.rollouts) >= self.group_size: self.groups.pop(episode_id) + if self.env_monitor: + self.env_monitor.cleanup_episode(self.group_id, episode_id) return group self.complete.clear() await self.complete.wait() @@ -156,7 +359,7 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, mode): self.mode = mode self.env_manager_config = env_manager_config self.group_size = self.env_manager_config.group_size - self.progress_bar = tqdm(desc=f"{self.mode} rollout progress(trajectory)", mininterval=self.env_manager_config.max_traj_per_env) + self.progress_bar = tqdm(desc=f"{self.mode} rollout progress(total trajectory)", mininterval=self.env_manager_config.max_traj_per_env) self.pending_gets = set() self.rollout_complete = {} @@ -170,7 +373,15 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, mode): else: self.async_generation_ratio = 0 self.max_traj_per_env = env_manager_config.max_traj_per_env if config.val_batch_size > 0 else None + + # Initialize env activity monitor first (before creating GroupQueues) self.group_queue: Dict[int, GroupQueue] = {} + self.env_monitor = EnvActivityMonitor( + config=config.env_monitor, + group_queue_dict=self.group_queue + ) + + # Create GroupQueues with env_monitor reference for rank, rank_env_configs in env_manager_config.env_configs.items(): for env_id, env_config in rank_env_configs.items(): group_id = env_config["group_id"] @@ -183,8 +394,13 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, mode): max_traj_per_env=self.max_traj_per_env, async_generation_ratio=self.async_generation_ratio, group_filter=self.group_filter, + env_monitor=self.env_monitor, ) + # Start monitoring after all GroupQueues are created + if config.env_monitor.enable: + self.env_monitor.start_monitoring() + # for debug self.total = 0 self.waiting = 0 @@ -208,19 +424,51 @@ def advance_step(self, step): for group_queue in self.group_queue.values(): group_queue.advance_step(step) - async def get_episode_id(self, group_id): + async def get_episode_id(self, group_id, env_id=None): + """ + Get the next episode ID for an environment. 
+ + Args: + group_id: Group ID + env_id: Environment ID (for hang detection tracking) + + Returns: + episode_id to process + """ assert group_id in self.group_queue - return await self.group_queue[group_id].get_episode_id() + return await self.group_queue[group_id].get_episode_id(env_id) def shutdown(self): + # Stop monitoring task + self.env_monitor.stop_monitoring() + for get_task in self.pending_gets: get_task.cancel() self.pending_gets = set() for group_queue in self.group_queue.values(): group_queue.shutdown() - def put(self, group_id, episode_id, start_step, rollout: DataProto): + def put(self, group_id, episode_id, start_step, rollout: DataProto, env_id=None): + """ + Put rollout data to queue. + + Args: + group_id: Group ID + episode_id: Episode ID + start_step: Starting step + rollout: Rollout data (can be None for final submission) + env_id: Environment ID (optional, for monitoring) + + Backward compatibility: + - Old calls: put(group_id, episode_id, start_step, rollout) - env_id defaults to None + - New calls: put(group_id, episode_id, start_step, rollout, env_id) - enables monitoring + """ assert group_id in self.group_queue + + # Record env activity only if env_id is provided + if env_id is not None: + self.env_monitor.record_activity(group_id, env_id, episode_id, rollout) + self.waiting += 1 self.group_queue[group_id].put(episode_id, start_step, rollout) self.waiting -= 1 @@ -236,6 +484,7 @@ async def get_batch(self, batch_size, current_step) -> List[DataProto]: # When batch_size < 0, iterate until exit run_rollout_loop immediately. ret: List[DataProto] = [] + progress_bar = tqdm(desc=f"{self.mode} rollout get_batch progress(trajectory)", mininterval=self.group_size) while batch_size < 0 or len(ret) < batch_size: if len(self.rollout_complete) == len(self.group_queue): @@ -277,7 +526,9 @@ async def wait_a_episode(): group_rollout = group_rollout[:self.group_size] ret.extend(group_rollout) - assert batch_size < 0 or (done and len(ret) >= batch_size) or (not done and len(ret) <= batch_size) + progress_bar.update(len(group_rollout)) + + assert batch_size < 0 or (done and len(ret) >= batch_size) or (not done and len(ret) <= batch_size), f"{batch_size=}, {len(ret)=}, {done=}" if done: self.pending_gets.update(done) self.pending_gets.update(pending) @@ -288,7 +539,7 @@ async def wait_a_episode(): d.meta_info["get_batch_return_start_time"] = get_batch_return_start_time return ret -class RolloutScheduler: +class RolloutScheduler(RolloutMockMixin): """ Usage: # User should control load_states/offload_states in pipeline by themselves. 
@@ -314,6 +565,10 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, resource_manage env_num = self.env_manager_config.world_size * self.env_manager_config.max_env_num_per_worker self.env_output_queue = GroupQueueManager.options( + name=f"GroupQueueManager-{mode}", + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), + soft=False), max_concurrency = env_num + 1 # reserve extra one for get_batch ).remote( self.config, @@ -322,12 +577,13 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, resource_manage ) self.generate_scheduler = RequestScheduler.options( + name=f"RequestScheduler-{self.env_manager_config.name}-{mode}", scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False, ), max_concurrency = env_num + 1 # reserve extra one for suspend/resume - ).remote(infer_cluster=self.infer_cluster, pipeline_config=config) + ).remote(infer_cluster=self.infer_cluster, pipeline_config=config, resource_manager=self.resource_manager) self.es_manager: Any = Cluster( name=self.env_manager_config.name, @@ -345,6 +601,12 @@ def __init__(self, config, env_manager_config: EnvManagerConfig, resource_manage self.rollout_task = None + # Partial GPU mode state atomicity + self.mode_switch_lock = asyncio.Lock() # Prevent concurrent shrink/expand + + # Initialize rollout mock mechanism from mixin + self._init_rollout_mock() + async def shutdown(self): if self.rollout_task is None: return @@ -366,6 +628,10 @@ async def _get_batch(self, batch_size, global_step): async def get_batch(self, data: DataProto, batch_size): global_step = data.meta_info["global_step"] + # MOCK MODE: Load pre-recorded data, skip rollout (from mixin) + if self._should_load_mock(global_step): + return await self._load_mock_batch(global_step) + # start env manager if self.rollout_task is None: seed = random.randint(0, 1000000) if self.mode == "train" else self.config.seed @@ -399,4 +665,95 @@ async def get_batch(self, data: DataProto, batch_size): batch = DataProto.concat(data_batch) batch.meta_info["metrics"] = metrics batch.meta_info["get_batch_return_start_time"] = time.time() + + # DUMP MODE: Save merged batch (from mixin) + await self._maybe_dump_batch(batch, global_step) + return batch + + async def shrink_sampler(self, target_gpus: List[int]) -> Dict[str, Any]: + """Thin wrapper: Delegate shrink operation to RequestScheduler. + + v4.6 ARCHITECTURAL CHANGE: RolloutScheduler no longer performs validation, + calculation, or state management. All worker lifecycle operations are now + owned by RequestScheduler for atomic execution under routing_lock. 
+ + Args: + target_gpus: GPU IDs to free (e.g., [4,5] for actor_train or [6,7] for critic) + + Returns: + Dict with metrics from RequestScheduler.shrink_workers(): + - "shrink_duration_ms": Total shrink operation time + - "offload_ranks": DP ranks offloaded + - "aborted": Number of requests aborted + - "remapped": Number of src_ranks remapped (cleared from routing) + - "rollout_scheduler_duration_ms": Timing from RolloutScheduler perspective + + Raises: + RuntimeError: If shrink_workers() fails (propagated from RequestScheduler) + + Side Effects: + - Calls RequestScheduler.shrink_workers() which performs: + * Validation, calculation, rebalancing, state offload atomically + * All operations protected by routing_lock + + Example: + # Shrink before training to free actor_train GPUs + metrics = await rollout_scheduler.shrink_sampler.remote([4, 5, 6, 7]) + # RequestScheduler handles: validation → calculation → rebalance → offload + """ + start_time = time.time() + + # Delegate complete shrink operation to RequestScheduler (atomic under routing_lock) + result = await self.generate_scheduler.shrink_workers.remote(target_gpus) + + # Add timing from RolloutScheduler perspective + result["rollout_scheduler_duration_ms"] = (time.time() - start_time) * 1000 + + return result + + async def expand_sampler(self, target_gpus: List[int], skip_load: bool = False) -> Dict[str, Any]: + """Thin wrapper: Delegate expand operation to RequestScheduler. + + v4.6 ARCHITECTURAL CHANGE: RolloutScheduler no longer performs validation, + calculation, or state management. All worker lifecycle operations are now + owned by RequestScheduler for atomic execution under routing_lock. + + Args: + target_gpus: GPU IDs to restore (e.g., [4,5] for actor_train or [6,7] for critic) + skip_load: If True, skip model loading (use when model_update already loaded states). + This only updates active_dp_ranks to restore routing state. 
+ + Returns: + Dict with metrics from RequestScheduler.expand_workers(): + - "expand_duration_ms": Total expand operation time + - "load_ranks": DP ranks reloaded + - "aborted": Number of requests aborted (proportional rebalancing) + - "remapped": Number of src_ranks remapped (same as aborted) + - "rollout_scheduler_duration_ms": Timing from RolloutScheduler perspective + + Raises: + RuntimeError: If expand_workers() fails (propagated from RequestScheduler) + + Side Effects: + - Calls RequestScheduler.expand_workers() which performs: + * Validation, calculation, state loading (unless skip_load=True), routing updates atomically + * All operations protected by routing_lock + + Example: + # Expand after training to restore actor_train GPUs + metrics = await rollout_scheduler.expand_sampler.remote([4, 5, 6, 7]) + # RequestScheduler handles: validation → calculation → load → rebalance + + # After model_update already loaded states, just restore routing: + metrics = await rollout_scheduler.expand_sampler.remote([4, 5, 6, 7], skip_load=True) + """ + start_time = time.time() + + # Delegate complete expand operation to RequestScheduler (atomic under routing_lock) + result = await self.generate_scheduler.expand_workers.remote(target_gpus, skip_load) + + # Add timing from RolloutScheduler perspective + result["rollout_scheduler_duration_ms"] = (time.time() - start_time) * 1000 + + return result diff --git a/roll/distributed/scheduler/user_defined_rollout_loop.py b/roll/distributed/scheduler/user_defined_rollout_loop.py new file mode 100644 index 000000000..7e46d38b2 --- /dev/null +++ b/roll/distributed/scheduler/user_defined_rollout_loop.py @@ -0,0 +1,271 @@ +import asyncio +import copy +import math +from typing import List, Optional + +import torch +from torch.nn.utils.rnn import pad_sequence + +from roll.distributed.scheduler.generate_scheduler import ( + RolloutContext, + expand_requests, + is_report_data_finished, +) +from roll.distributed.scheduler.protocol import DataProto +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.functionals import ( + postprocess_generate, + concatenate_input_and_output, +) +from roll.utils.logging import get_logger + + +logger = get_logger() + +# ================= helper functions ================= + +def query_filter(data_list: List[DataProto], config: RLVRConfig) -> bool: + """ + Each domain can define its own filtering rules. + """ + response_level_rewards = [data.batch["response_level_rewards"] for data in data_list] + if len(response_level_rewards) == 1: + return True + rewards = torch.cat(response_level_rewards, dim=0) + + domain = data_list[0].non_tensor_batch["domain"][0] + query_filter_config = config.rewards[domain].query_filter_config + + if query_filter_config.type == "no_filter": + return True + elif query_filter_config.type == "mean_filter": + threshold_up = query_filter_config.filter_args.get("threshold_up", math.inf) + threshold_down = query_filter_config.filter_args.get("threshold_down", -1) + if torch.mean(rewards) <= threshold_down or torch.mean(rewards) >= threshold_up: + return False + elif query_filter_config.type == "std_filter": + std_threshold = query_filter_config.filter_args.get("std_threshold", -1) + if torch.std(rewards) <= std_threshold: + return False + return True + +def response_filter(data_item, config): + return True + +def postprocess_paused_data(pre_data, data: DataProto, sequence_length, prompt_length) -> DataProto: + if "output_token_ids" not in data.meta_info: # aborted without having
inferred a token + # too many this log means need more infer workers + logger.info(f"received data without output_token_ids, request_id: {data.meta_info['request_id']}") + return pre_data + logger.debug(f"received paused data, request_id: {data.meta_info['request_id']}") + + assert len(data.meta_info["output_token_ids"]) == 1, ( + "async pipeline only support num_return_sequences=1 or is_num_return_sequences_expand=True" + ) + + # value: list[list[int|float]] + for key in ["output_token_ids", "output_logprobs"]: + cur_value = data.meta_info.pop(key) + pre_value = pre_data.meta_info.get(f"pre_{key}", [[]] * len(cur_value)) + assert len(pre_value) == len(cur_value) + pre_value = [pre_value[i] + cur_value[i] for i in range(len(pre_value))] + data.meta_info[f"pre_{key}"] = pre_value + new_batch = {**pre_data.batch} + + init_attention_mask = pre_data.batch.get("init_attention_mask", pre_data.batch["attention_mask"]) + new_batch["init_attention_mask"] = init_attention_mask + new_batch["init_input_ids"] = pre_data.batch.get("init_input_ids", pre_data.batch["input_ids"]) + + # concat pre output_ids and input_ids + new_input_ids = concatenate_input_and_output( + input_ids=new_batch["init_input_ids"], + output_ids=torch.LongTensor(data.meta_info["pre_output_token_ids"]), + num_return_sequences=len(data.meta_info["pre_output_token_ids"]), + ) + new_batch["input_ids"] = new_input_ids + + new_attention_mask = torch.ones_like(new_input_ids, dtype=init_attention_mask.dtype) + new_attention_mask[:, :init_attention_mask.shape[1]] = init_attention_mask + new_batch["attention_mask"] = new_attention_mask + + max_new_tokens = sequence_length - new_input_ids.shape[1] + if max_new_tokens <= 0: + raise ValueError(f"max_new_tokens {max_new_tokens} <= 0, init_input_ids {new_batch['init_input_ids'].shape}, " + f"pre_output_token_ids {len(data.meta_info['pre_output_token_ids'][0])}") + data.meta_info["max_new_tokens"] = max_new_tokens + new_non_tensor_batch = dict( + [(k, v.repeat(len(data.meta_info["pre_output_token_ids"]))) for k, v in pre_data.non_tensor_batch.items()] + ) # repeat num_return_sequences=1 + if "multi_modal_data" in pre_data.non_tensor_batch: + for i, (mm_data, prompt_token_ids) in enumerate( + zip(new_non_tensor_batch["multi_modal_data"], data.meta_info["pre_output_token_ids"]) + ): + # use new dict to replace repeated reference + mm_data = new_non_tensor_batch["multi_modal_data"][i] = dict(mm_data) + # VLM uses prompt_ids (without replaced image tokens) in multi_modal_data + prompt_token_ids = mm_data["prompt_token_ids"] + prompt_token_ids + mm_data.update({"prompt_token_ids": prompt_token_ids}) + data = DataProto.from_dict( + new_batch, non_tensors=new_non_tensor_batch, meta_info={**pre_data.meta_info, **data.meta_info} + ) + assert data.batch["init_attention_mask"].shape[1] == prompt_length + assert data.batch["init_input_ids"].shape[1] == prompt_length + return data + +def postprocess_output_data(request, data: DataProto, sequence_length) -> DataProto: + # postprocess_generate, input_ids, attention_mask, left pad + eos_token_id = data.meta_info["eos_token_id"] + pad_token_id = data.meta_info["pad_token_id"] + input_ids = request.batch.pop("init_input_ids", request.batch["input_ids"]) + request.batch["input_ids"] = input_ids + request.batch["attention_mask"] = request.batch.pop("init_attention_mask", request.batch["attention_mask"]) + output_token_ids = data.meta_info["output_token_ids"] + pre_output_token_ids = request.meta_info.pop("pre_output_token_ids", [[]] * len(output_token_ids)) + 
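+    # Prepend tokens (and logprobs) accumulated from earlier paused, partial-rollout responses so the full response is reconstructed.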
output_token_ids = [pre_output_token_ids[i] + output_token_ids[i] for i in range(len(pre_output_token_ids))] + + output_logprobs = data.meta_info.get("output_logprobs", None) + if output_logprobs is not None: + pre_output_logprobs = request.meta_info.get("pre_output_logprobs", [[]] * len(output_token_ids)) + output_logprobs = [pre_output_logprobs[i] + output_logprobs[i] for i in range(len(pre_output_logprobs))] + + output_tokens = [torch.tensor(token_ids) for token_ids in output_token_ids] + output_tensor = pad_sequence(output_tokens, batch_first=True, padding_value=pad_token_id) + output_tensor = concatenate_input_and_output( + input_ids=input_ids, output_ids=output_tensor, num_return_sequences=len(output_tokens) + ) + output: DataProto = postprocess_generate( + prompts=request, + output=output_tensor, + num_return_sequences=len(output_tokens), + sequence_length=sequence_length, + eos_token_id=eos_token_id, + pad_token_id=pad_token_id, + output_logprobs=output_logprobs, + ) + request_repeat = request.repeat(repeat_times=len(output_tokens)) + output.non_tensor_batch = request_repeat.non_tensor_batch + output.meta_info = request_repeat.meta_info + return output + +# ================= example of user defined rollout loop ================= + +class UserDefinedRolloutLoop: + """ + Default user defined rollout loop. + + User should write there own udrl class with an async function name process_new_prompt + with signature (self, context: RolloutContext) -> Optional[DataProto|List[DataProto]]. + + RolloutContext hide almost all the implementation details of DynamicSamplingScheduler, LoadBalancer, + ReplayBuffer, and sync/async training. + + A typical process_new_prompt has few steps: + 1. get and filter dataset + 2: spawn tasks to process requests, including generate, reward, and response level filter + 3. prompt level filter + 4. return responses to commit to ReplayBuffer + + To abort this prompt(or dataset), just return None at any where. + + Exception safe: + The framework will only raise asyncio.CancelledError exception. (process_new_prompt will be called by scheduler + as an asyncio.Task and scheduler may cancel this task if needed. User should not suppress asyncio.CancelledError + exception and should handle clean up by themself.) + + User should catch all other exceptions, any other exceptions will be treat as sys.exit by framework. + """ + def __init__(self): + pass + + async def process_new_prompt(self, context: RolloutContext) -> Optional[DataProto|List[DataProto]]: + num_return_sequences = context.meta_info["generation_config"]["num_return_sequences"] + # TODO user can control whether to expand requests at prompt level + is_num_return_sequences_expand = context.is_num_return_sequences_expand + + ################# STEP 1: get and filter dataset + # TODO shigao dataset这一层应该暴露哪些部分(是collect前还是后面的数据呢),需要用户自定义collect_fn吗 + request_data, domain = context.get_request_data(meta_info=context.meta_info) + request_data_list = expand_requests(data=request_data, num_return_sequences=num_return_sequences, + is_num_return_sequences_expand=is_num_return_sequences_expand) + # TODO data filter + + ################# STEP 2: spawn tasks to process requests, including generate, reward, and filter at response level + # Must run inside RolloutContext.do_generate_and_reward context. + # RolloutContext.do_generate_and_reward will wait until can send new request (controlled by LoadBalancer). + # And at exit, RolloutContext will enforce there is no running requests. 
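# Hedged sketch of the per-domain filter settings consumed by query_filter() above and
# applied at STEP 3 below. Field names are inferred from that helper; the authoritative
# schema lives in RLVRConfig.rewards, and the domain name "math_rule" is hypothetical.
#   rewards:
#     math_rule:
#       query_filter_config:
#         type: std_filter           # one of: no_filter | mean_filter | std_filter
#         filter_args:
#           std_threshold: 0.0       # drop the prompt if the reward std is <= threshold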
+ async with context.do_generate_and_reward(max_concurrency=num_return_sequences): + responses_list: List[List[DataProto]] = await asyncio.gather( + *[self._generate_and_reward(context=context, req=req, domain=domain) for req in request_data_list] + ) + responses: List[DataProto] = [item for sublist in responses_list for item in sublist] + # User can call RolloutContext.abort_running_requests to abort any running generate requests (generate will return a response + # with finish_reason=="abort", user should distinguish this from partial rollout to avoid dead loop). + # assert there is no running requests outside do_generate_and_reward context. + + ################# STEP 3: prompt level filter + if not context.is_val and not query_filter(responses, context.pipeline_config): + # TODO add metrics (query_filter_count) + logger.debug(f"prompt_id {context.prompt_id} is filtered") + return + + ################# STEP 4: return responses to commit to ReplayBuffer + return responses + + async def _generate_and_reward( + self, + context: RolloutContext, + req: DataProto, + domain: str, + ): + responses: List[DataProto] = [] + + for _ in range(5): # limit max retry times, otherwise may cause dead loop + original_req = copy.deepcopy(req) + + # TODO deprecate collect_unfinished after sglang support partial rollout + collect_unfinished = req.meta_info.get("collect_unfinished", False) + + # TODO: multi-turn rollout + while True: + # TODO: user defined request preprocessor + + data = await context.generate(req=req, domain=domain) + + # TODO: user defined response postprocessor + + # Scheduler may abort request in async training. Should resend partial output + # to support partial rollout. + if is_report_data_finished(data): + req = postprocess_output_data(req, data, context.sequence_length) + break + else: + if not collect_unfinished: + # return None to abort this prompt + return + else: + req = postprocess_paused_data(req, data, context.sequence_length, context.prompt_length) + + rewards = await context.compute_rewards(req=req, domain=domain) + req.union(rewards) + + output_count = req.batch.batch_size[0] + assert output_count == req.meta_info["generation_config"]["num_return_sequences"] + batch_expanded = [req[[idx]] for idx in range(output_count)] + + response_filter_count = 0 + for batch_item in batch_expanded: + if context.is_val or response_filter(batch_item, context.pipeline_config): + responses.append(batch_item) + else: + # TODO add metrics (response_filter_count) + response_filter_count += 1 + + if response_filter_count == 0: + break + else: + # retry if filter out some responses + original_req.meta_info["generation_config"]["num_return_sequences"] = response_filter_count + req = original_req + + return responses diff --git a/roll/distributed/strategy/deepspeed_strategy.py b/roll/distributed/strategy/deepspeed_strategy.py index 0053fb4c1..f81fb19da 100644 --- a/roll/distributed/strategy/deepspeed_strategy.py +++ b/roll/distributed/strategy/deepspeed_strategy.py @@ -1,12 +1,9 @@ -import os from collections import defaultdict from contextlib import nullcontext -from dataclasses import asdict from datetime import timedelta from typing import Callable, Dict, Tuple import deepspeed -import ray import torch import torch.distributed as dist from codetiming import Timer @@ -14,7 +11,6 @@ from deepspeed.runtime.zero import GatheredParameters from deepspeed.runtime.zero.offload_config import OffloadStateTypeEnum from peft import get_peft_model_state_dict -from tqdm import tqdm from transformers import 
get_scheduler, set_seed from transformers.integrations import HfDeepSpeedConfig @@ -23,6 +19,8 @@ from roll.distributed.scheduler.protocol import DataProto from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_processor_provider, default_tokenizer_provider +from roll.platforms import current_platform +from roll.third_party.deepspeed.model_update import DeepSpeedWeightUpdater from roll.third_party.deepspeed.offload_states_patch import bind_deepspeed_offload_states_func from roll.utils.collective import collective from roll.utils.context_parallel import get_ulysses_group, set_upg_manager @@ -31,7 +29,6 @@ from roll.utils.constants import IGNORE_INDEX from roll.utils.logging import get_logger from roll.utils.offload_states import OffloadStateType -from roll.platforms import current_platform logger = get_logger() @@ -143,6 +140,15 @@ def forward_step( micro_batch_size = batch.meta_info["micro_batch_size"] num_microbatches = max(batch_size // micro_batch_size, 1) micro_batches = batch.chunk(chunks=num_microbatches) + + cp_size = self.worker.rank_info.cp_size + batch_num_tokens = self._get_batch_num_tokens(batch) + batch.meta_info['batch_num_tokens'] = {k: v // cp_size for k, v in batch_num_tokens.items()} + global_valid_tokens = self._get_global_valid_samples(batch) + batch.meta_info['global_valid_samples'] = {k: v // cp_size for k, v in global_valid_tokens.items()} + + loss_scale = num_microbatches * self.worker.rank_info.dp_size + disable_adapter = batch.meta_info.get("disable_adapter", False) adapter_context = self.unwrap_model().disable_adapter() if disable_adapter else nullcontext() losses_reduced = [] @@ -184,6 +190,8 @@ def forward_step( **forward_args, ) loss, loss_reduced = forward_func(data, output.logits) + if self.worker_config.apply_loss_scale: + loss *= loss_scale losses_reduced.append(loss_reduced) results = collate_fn_to_dict_list(losses_reduced) return results @@ -240,16 +248,6 @@ def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, para param.data.copy_(weight) del weight - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker): - param = self.model.get_parameter(parameter_name) - if not self.ds_config.is_zero3(): - param.data.copy_(weight) - else: - with GatheredParameters([param], modifier_rank=0): - if dist.get_rank() == 0: - param.data.copy_(weight) - del weight - # offload/load 相关接口 def load_states(self, include=None, non_blocking=False): if include is not None: @@ -341,6 +339,8 @@ def initialize(self, model_provider): self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) self.processor = default_processor_provider(model_args=self.worker_config.model_args) + self.weight_updaters = {} + model = model_provider(tokenizer=self.tokenizer, model_args=self.worker_config.model_args, is_trainable=True) if cp_size > 1: @@ -415,6 +415,7 @@ def op_compute_language_loss(self, logits: torch.Tensor, labels: torch.Tensor): Returns: loss: Scalar loss tensor + metrics: Dict """ # Labels already shifted by DataCollator, directly compute cross-entropy loss = torch.nn.functional.cross_entropy( @@ -422,7 +423,8 @@ def op_compute_language_loss(self, logits: torch.Tensor, labels: torch.Tensor): labels.view(-1), ignore_index=IGNORE_INDEX ) - return loss + metrics = {f"{self.worker_config.name}/loss@sum": loss.detach().float().unsqueeze(0)} + return loss, metrics def train_step( self, @@ -431,8 +433,18 @@ def train_step( ): 
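# Note on the additions below (mechanics are as written in the code; the rationale is a
# hedged interpretation): batch_num_tokens and global_valid_samples are divided by
# cp_size and stashed in meta_info so a custom loss_func can normalize by global
# token/sample counts, and, when worker_config.apply_loss_scale is set, the loss is
# multiplied by loss_scale = num_microbatches (or mini_steps) * dp_size to compensate
# for the per-micro-batch and data-parallel averaging that would otherwise shrink a
# globally normalized loss.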
self.model.train() mini_batch_size = self.worker_config.training_args.per_device_train_batch_size - data_iter = batch.make_iterator(mini_batch_size=mini_batch_size, epochs=1) mini_steps = batch.batch.batch_size[0] // self.worker_config.training_args.per_device_train_batch_size + + cp_size = self.worker.rank_info.cp_size + batch_num_tokens = self._get_batch_num_tokens(batch) + batch.meta_info['batch_num_tokens'] = {k: v // cp_size for k, v in batch_num_tokens.items()} + global_valid_tokens = self._get_global_valid_samples(batch) + batch.meta_info['global_valid_samples'] = {k: v // cp_size for k, v in global_valid_tokens.items()} + + loss_scale = mini_steps * self.worker.rank_info.dp_size + batch.meta_info['micro_batch_size'] = mini_batch_size + + data_iter = batch.make_iterator(mini_batch_size=mini_batch_size, epochs=1) metrics = {} for step in range(mini_steps): @@ -472,6 +484,8 @@ def train_step( loss, loss_reduced = loss_func(data, output.logits) append_to_dict(metrics, loss_reduced) loss *= self.worker.rank_info.cp_size + if self.worker_config.apply_loss_scale: + loss *= loss_scale self.model.backward(loss) is_gradient_accumulation_boundary = self.model.is_gradient_accumulation_boundary() @@ -554,113 +568,18 @@ def collect_lora_params(self): del lora_state_dict return lora_params - def model_update(self, model_update_name, tgt_workers, broadcast_tgt_devices, p2p_tgt_devices): - model = self.unwrap_model() - if is_lora := (self.worker_config.model_args.lora_target is not None): - all_params = self.collect_lora_params() - peft_config = model.peft_config.get("default", None) - else: - all_params = list(model.named_parameters()) - - comm_plan = self.model_update_comm_plan[model_update_name][self.worker.rank_info.pp_rank] - model = self.unwrap_model() - broadcast_time_cost = 0 - with Timer("model_update_total") as timer_total: - for param_name, param in tqdm( - all_params, desc="weight update progress", total=len(all_params) - ): - shape = param.shape if not self.ds_config.is_zero3() else param.ds_shape - if not self.ds_config.is_zero3(): - - param_weight = param.data - refs = [] - for p2p_tgt_device in p2p_tgt_devices: - p2p_tgt_worker = tgt_workers[p2p_tgt_device["rank"]] - ref = p2p_tgt_worker.update_parameter.remote( - model_update_name=model_update_name, - parameter_name=param_name, - weight=param_weight, - ranks_in_worker=[p2p_tgt_device["device"]["rank"]], - is_lora=is_lora, - ) - refs.append(ref) - - if ( - self.worker.rank_info.tp_rank == 0 - and self.worker.rank_info.cp_rank == 0 - and self.worker.rank_info.dp_rank == 0 - ): - for worker in tgt_workers: - ref = worker.broadcast_parameter.remote( - model_update_name=model_update_name, - src_pp_rank=self.worker.rank_info.pp_rank, - dtype=param_weight.dtype, - shape=shape, - parameter_name=param_name, - is_lora=is_lora, - ) - refs.append(ref) - if len(broadcast_tgt_devices) > 0: - collective.broadcast(tensor=param_weight, src_rank=0, group_name=comm_plan["group_name"]) - ray.get(refs) - - else: - with GatheredParameters([param]): - param_weight = param.data - with Timer("broadcast") as timer_broadcast: - refs = [] - for p2p_tgt_device in p2p_tgt_devices: - p2p_tgt_worker = tgt_workers[p2p_tgt_device["rank"]] - ref = p2p_tgt_worker.update_parameter.remote( - model_update_name=model_update_name, - parameter_name=param_name, - weight=param_weight, - ranks_in_worker=[p2p_tgt_device["device"]["rank"]], - is_lora=is_lora, - ) - refs.append(ref) - - if ( - self.worker.rank_info.tp_rank == 0 - and self.worker.rank_info.cp_rank == 0 - and 
self.worker.rank_info.dp_rank == 0 - ): - for worker in tgt_workers: - ref = worker.broadcast_parameter.remote( - model_update_name=model_update_name, - src_pp_rank=self.worker.rank_info.pp_rank, - dtype=param_weight.dtype, - shape=shape, - parameter_name=param_name, - is_lora=is_lora, - ) - refs.append(ref) - if len(broadcast_tgt_devices) > 0: - collective.broadcast( - tensor=param_weight, src_rank=0, group_name=comm_plan["group_name"] - ) - ray.get(refs) - broadcast_time_cost += timer_broadcast.last - - if is_lora: - with Timer("add_lora") as timer_add_lora: - if ( - self.worker.rank_info.tp_rank == 0 - and self.worker.rank_info.cp_rank == 0 - and self.worker.rank_info.dp_rank == 0 - ): - refs = [] - for worker in tgt_workers: - ref = worker.add_lora.remote(peft_config=asdict(peft_config)) - refs.append(ref) - ray.get(refs) + def setup_model_update(self, infer_cluster, model_update_name: str): + assert model_update_name not in self.weight_updaters + is_lora = self.worker_config.model_args.lora_target is not None + self.weight_updaters[model_update_name] = DeepSpeedWeightUpdater( + pipeline_config=self.worker.pipeline_config, + infer_cluster=infer_cluster, + worker_config=self.worker_config, + model_update_name=model_update_name, + model=self.unwrap_model(), + ds_config=self.ds_config, + is_lora=is_lora, + ) - metrics = { - "broadcast": broadcast_time_cost, - } - if is_lora: - metrics["all_gather"] = timer_total.last - broadcast_time_cost - timer_add_lora.last - metrics["add_lora"] = timer_add_lora.last - else: - metrics["all_gather"] = timer_total.last - broadcast_time_cost - return metrics + def model_update(self, model_update_name: str): + return self.weight_updaters[model_update_name].model_update() diff --git a/roll/distributed/strategy/factory.py b/roll/distributed/strategy/factory.py index e408fd929..a83dcf0f7 100644 --- a/roll/distributed/strategy/factory.py +++ b/roll/distributed/strategy/factory.py @@ -2,9 +2,16 @@ from roll.distributed.executor.worker import Worker from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy +from roll.utils.asyncio_decorator import create_sync_class -def create_strategy(worker: Worker) -> Union[InferenceStrategy, TrainStrategy]: +def create_strategy(worker: Worker, sync_wrapper: bool = False) -> Union[InferenceStrategy, TrainStrategy]: + """ + Args: + sync_wrapper (bool): vllm and sglang override interface of InferenceStrategy to async function. + When use those two strategy in ray Threaded Actor, we provide sync_wrapper to wrap + async function to sync function to avoid writing too much loop.run_unti_complete. 
+ """ strategy_name = worker.worker_config.strategy_args.strategy_name # Lazy import strategy to avoid cuda initialized @@ -13,7 +20,7 @@ def create_strategy(worker: Worker) -> Union[InferenceStrategy, TrainStrategy]: elif strategy_name == "deepspeed_train": from roll.distributed.strategy.deepspeed_strategy import DeepSpeedTrainStrategy as strategy_cls elif strategy_name == "diffusion_deepspeed_train": - from roll.distributed.strategy.diffusion_strategy import DeepSpeedTrainStrategy as strategy_cls + from roll.distributed.strategy.diffusion_strategy import DeepSpeedTrainStrategy as strategy_cls elif strategy_name == "hf_infer": from roll.distributed.strategy.hf_strategy import HfInferStrategy as strategy_cls elif strategy_name == "vllm": @@ -26,7 +33,13 @@ def create_strategy(worker: Worker) -> Union[InferenceStrategy, TrainStrategy]: from roll.distributed.strategy.megatron_strategy import MegatronTrainStrategy as strategy_cls elif strategy_name == "mock_infer": from roll.distributed.strategy.mock_strategy import MockInferStrategy as strategy_cls + elif strategy_name == "fsdp2_infer": + from roll.distributed.strategy.fsdp2_strategy import FSDP2InferStrategy as strategy_cls + elif strategy_name == "fsdp2_train": + from roll.distributed.strategy.fsdp2_strategy import FSDP2TrainStrategy as strategy_cls else: raise ValueError(f"Unknown strategy name: {strategy_name}") + if sync_wrapper: + strategy_cls = create_sync_class(strategy_cls) return strategy_cls(worker) diff --git a/roll/distributed/strategy/fsdp2_strategy.py b/roll/distributed/strategy/fsdp2_strategy.py new file mode 100644 index 000000000..389ff9cb2 --- /dev/null +++ b/roll/distributed/strategy/fsdp2_strategy.py @@ -0,0 +1,1268 @@ +import contextlib +import os +import random +from collections import defaultdict +from contextlib import nullcontext +from typing import Callable, Dict, Optional, Tuple + +import numpy as np +import ray +import torch +import torch.distributed as dist +import torch.distributed.checkpoint as dcp +from codetiming import Timer +from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input +from torch import optim +from torch.distributed.checkpoint.state_dict import StateDictOptions, get_model_state_dict +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy +from torch.distributed.tensor import DTensor, distribute_tensor +from torch.nn.utils import clip_grad_norm_ +from torch.nn.utils.clip_grad import _clip_grads_with_norm_, _get_total_norm +from transformers import AutoConfig, get_scheduler, set_seed + +from roll.datasets.collator import collate_fn_to_dict_list +from roll.distributed.executor.worker import Worker +from roll.distributed.scheduler.protocol import DataProto +from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy +from roll.models.model_providers import ( + clear_fsdp2_init_context, + default_processor_provider, + default_tokenizer_provider, + set_fsdp2_init_context, +) +from roll.platforms import current_platform +from roll.third_party.fsdp2.model_update import FSDP2WeightUpdater +from roll.utils.checkpoint_manager import CheckpointManager, download_model +from roll.utils.collective import collective +from roll.utils.context_parallel import get_ulysses_group, set_upg_manager +from roll.utils.context_parallel.autograd_gather import ulysses_gather +from roll.utils.context_parallel.rmpad_ulysses import ( + gather_outputs_and_unpad, + ulysses_pad_and_slice_inputs, + 
ulysses_pad_inputs, +) +from roll.utils.fsdp_utils import ( + apply_fsdp2, + fsdp2_load_full_state_dict, + get_init_weight_context_manager, + get_shard_placement_fn, +) +from roll.utils.functionals import append_to_dict, log_probs_from_logits +from roll.utils.logging import get_logger +from roll.utils.offload_states import OffloadStateType + +logger = get_logger() + + +def _parse_dtype(dtype): + if dtype is None: + return None + + if isinstance(dtype, torch.dtype): + return dtype + + if isinstance(dtype, str): + dtype_lower = dtype.lower() + dtype_map = { + "bf16": torch.bfloat16, + "bfloat16": torch.bfloat16, + "fp16": torch.float16, + "float16": torch.float16, + "half": torch.float16, + "fp32": torch.float32, + "float32": torch.float32, + "float": torch.float32, + "fp64": torch.float64, + "float64": torch.float64, + } + + if dtype_lower in dtype_map: + return dtype_map[dtype_lower] + else: + if hasattr(torch, dtype): + return getattr(torch, dtype) + else: + raise ValueError( + f"Unsupported dtype string: '{dtype}'. " f"Supported values: {list(dtype_map.keys())}" + ) + + return dtype + + +def create_device_mesh_with_ulysses(world_size: int, fsdp_size: int): + """ + Create device mesh for FSDP. + """ + + # Default to global sharding (1D mesh) if fsdp_size is not explicitly set for HSDP + if fsdp_size <= 1 or fsdp_size >= world_size: + mesh_shape = (world_size,) + mesh_dim_names = ["fsdp"] + else: + # HSDP Case: Shard within fsdp_size group, Replicate across the rest + # PyTorch fully_shard shards on the LAST dimension (inner) and replicates on outer dimensions. + # Example: world=8, fsdp=4. We want 2 replicas of 4-way sharding. + # Mesh: (2, 4). Replicate on dim 0 (2), Shard on dim 1 (4). + ddp_size = world_size // fsdp_size + mesh_shape = (ddp_size, fsdp_size) + mesh_dim_names = ["ddp", "fsdp"] + + return init_device_mesh( + current_platform.device_type, + mesh_shape=mesh_shape, + mesh_dim_names=mesh_dim_names, + ) + + +class FSDP2StrategyBase(InferenceStrategy): + def __init__(self, worker: Worker): + super().__init__(worker) + self.cpu_offload_enabled: bool = False + if not hasattr(self, "checkpoint_manager") or self.checkpoint_manager is None: + checkpoint_config = getattr(self.worker_config, "checkpoint_config", None) + self.checkpoint_manager = CheckpointManager(checkpoint_config=checkpoint_config) + self._model_update_device_buffer: Optional[torch.Tensor] = None + self.weight_updaters = {} + + def _get_dp_rank(self) -> int: + rank_info = getattr(self.worker, "rank_info", None) + if rank_info is not None and getattr(rank_info, "dp_rank", None) is not None: + return rank_info.dp_rank + return dist.get_rank() + + def _build_checkpoint_paths( + self, + base_dir: str, + world_size: Optional[int] = None, + dp_rank: Optional[int] = None, + ): + world_size = world_size or dist.get_world_size() + dp_rank = dp_rank if dp_rank is not None else self._get_dp_rank() + suffix = f"world_size_{world_size}_rank_{dp_rank}.pt" + model_path = os.path.join(base_dir, f"model_{suffix}") + optim_path = os.path.join(base_dir, f"optim_{suffix}") + extra_path = os.path.join(base_dir, f"extra_state_{suffix}") + return model_path, optim_path, extra_path + + @staticmethod + def _get_dcp_checkpoint_dir(base_dir: str) -> str: + return os.path.join(base_dir, "dcp") + + def _get_dcp_state_dict_options(self, full_state_dict: bool = False) -> StateDictOptions: + # Always use cpu_offload=True for DCP to avoid OOM during load/save + # independent of training offload configuration. 
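# Background (not part of the patch): with full_state_dict=False, model/optimizer
# entries stay as per-rank DTensor shards, which is the form dcp.save()/dcp.load()
# works with here; with full_state_dict=True (used for the HuggingFace export in
# save_checkpoint below), get_model_state_dict() gathers complete tensors, and
# cpu_offload=True moves them to host memory as they are materialized.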
+ return StateDictOptions( + full_state_dict=full_state_dict, + cpu_offload=True, + ) + + def _save_checkpoint_with_dcp(self, checkpoint_dir: str, is_last_step: bool): + state_dict = { + **self.model.state_dict(), + } + + optimizer = getattr(self, "optimizer", None) + if optimizer is not None: + state_dict["optimizer"] = optimizer + + scheduler = getattr(self, "scheduler", None) + if scheduler is not None: + state_dict["scheduler"] = scheduler + + rng_state = self.get_rng_state() + state_dict["rng_state"] = rng_state + + if not self.async_save_strategy or is_last_step: + if self.checkpoint_future is not None: + self.checkpoint_future.result() + self.checkpoint_future = None + dcp.save( + state_dict=state_dict, + checkpoint_id=checkpoint_dir, + ) + else: + if self.checkpoint_future is not None: + self.checkpoint_future.result() + self.checkpoint_future = dcp.async_save( + state_dict=state_dict, + checkpoint_id=checkpoint_dir, + ) + + def _load_checkpoint_with_dcp(self, checkpoint_dir: str): + state_dict = { + **self.model.state_dict(), + } + + optimizer = getattr(self, "optimizer", None) + if optimizer is not None: + state_dict["optimizer"] = optimizer + + scheduler = getattr(self, "scheduler", None) + if scheduler is not None: + state_dict["scheduler"] = scheduler + + state_dict["rng_state"] = {} + + dcp.load( + state_dict=state_dict, + checkpoint_id=checkpoint_dir, + ) + + if "rng_state" in state_dict and state_dict["rng_state"]: + self.load_rng_state(state_dict["rng_state"]) + + info = self.model.load_state_dict(state_dict, strict=False) + missing_keys = info.missing_keys + unexpected_keys = info.unexpected_keys + + filtered_unexpected_keys = [ + key for key in unexpected_keys if key not in ("optimizer", "scheduler", "rng_state") + ] + + if missing_keys: + logger.warning(f"Missing keys: {missing_keys}") + if filtered_unexpected_keys: + logger.warning(f"Unexpected keys: {filtered_unexpected_keys}") + + def _load_checkpoint_from_legacy_shards( + self, + load_dir: str, + world_size: int, + dp_rank: int, + optimizer, + ): + model_path, optim_path, _ = self._build_checkpoint_paths( + load_dir, + world_size=world_size, + dp_rank=dp_rank, + ) + + model_state_dict = self._load_torch_file(model_path, required=True) + optimizer_state_dict = self._load_torch_file(optim_path, required=optimizer is not None) + + if not model_state_dict: + logger.warning("Empty model state dict loaded from %s, skipping model restore", model_path) + return + + first_param = next(iter(model_state_dict.values())) + if isinstance(first_param, DTensor): + self.model.load_state_dict(model_state_dict, assign=True) + else: + meta_sharded_sd = self.model.state_dict() + sharded_sd = {} + for param_name, full_tensor in model_state_dict.items(): + if param_name in meta_sharded_sd: + sharded_meta_param = meta_sharded_sd[param_name] + if isinstance(sharded_meta_param, DTensor): + # Respect the DTensor's device (CPU for offload_policy=True) + target_device = sharded_meta_param.device + sharded_tensor = distribute_tensor( + full_tensor.to(target_device), + sharded_meta_param.device_mesh, + sharded_meta_param.placements, + ) + sharded_sd[param_name] = torch.nn.Parameter(sharded_tensor) + else: + sharded_sd[param_name] = torch.nn.Parameter(full_tensor) + else: + sharded_sd[param_name] = torch.nn.Parameter(full_tensor) + self.model.load_state_dict(sharded_sd, assign=True) + + if optimizer_state_dict is not None and optimizer is not None: + optimizer.load_state_dict(optimizer_state_dict) + + def _load_extra_state_dict(self, base_dir: 
str, world_size: int, dp_rank: int): + _, _, extra_state_path = self._build_checkpoint_paths( + base_dir, + world_size=world_size, + dp_rank=dp_rank, + ) + + if os.path.exists(extra_state_path): + return torch.load(extra_state_path, map_location="cpu", weights_only=False) + + return {} + + def save_checkpoint(self, save_dir, global_step, ckpt_id, tag="checkpoint", local_state_path=None, **kwargs): + """ + Save the sharded (DTensor) checkpoint as well as HF-compatible full weights. + In FSDP, all ranks should coordinate: + 1. All ranks save their sharded checkpoints (model/optim/extra state) to the same directory + 2. Only rank 0 saves the full HuggingFace-compatible model + """ + logger.info(f"save_dir: {save_dir}") + if local_state_path is None: + local_state_path = save_dir + + is_last_step = kwargs.get("is_last_step", None) + + if is_last_step is None: + if self.worker_config.training_args.max_steps is not None: + is_last_step = global_step == self.worker_config.training_args.max_steps - 1 + else: + # If max_steps is not set, we consider all steps as the last step in case of hang for async saving + is_last_step = True + + # PumpkinComment: + # Why we need to wait here and also in save_dcp? Because if not, easy to hang in LoRA + # Not sure why, but keep the logic here for now. + if self.async_save_strategy and self.checkpoint_future is not None: + logger.info("Waiting for previous async checkpoint to complete...") + self.checkpoint_future.result() + self.checkpoint_future = None + + os.makedirs(save_dir, exist_ok=True) + + with Timer("load", logger=None) as load_timer: + self.load_states() + + dcp_checkpoint_dir = self._get_dcp_checkpoint_dir(save_dir) + os.makedirs(dcp_checkpoint_dir, exist_ok=True) + + with Timer("dcp_save", logger=None) as dcp_timer: + self._save_checkpoint_with_dcp(checkpoint_dir=dcp_checkpoint_dir, is_last_step=is_last_step) + + # PumpkinComment: + # If DCP save is async, uploading (which may copy+delete the local dir) must not start + # until the async save has fully finished writing checkpoint shards. + dcp_save_future = self.checkpoint_future if (self.async_save_strategy and not is_last_step) else None + + with Timer("hf_save", logger=None) as hf_timer: + full_state_options = self._get_dcp_state_dict_options(full_state_dict=True) + full_model_state = get_model_state_dict( + model=self.model, + options=full_state_options, + ) + + if dist.get_rank() == 0: + underlying_model = self.unwrap_model() + underlying_model.save_pretrained( + save_dir, + state_dict=full_model_state, + safe_serialization=True, + ) + self.tokenizer.save_pretrained(save_dir) + if getattr(self, "processor", None): + self.processor.save_pretrained(save_dir) + + checkpoint_config = getattr(self.worker_config, "checkpoint_config", None) or {} + async_upload = checkpoint_config.get("async_upload", True) + keep_local_file = checkpoint_config.get("keep_local_file", False) + if dcp_save_future is not None and async_upload: + + def _on_dcp_done(fut): + print("[DEBUG] Enter Callback for DCP save") + try: + fut.result() + except Exception: + logger.error(f"Async DCP save failed for ckpt_id={ckpt_id}, skip upload.") + return + + self.thread_executor.submit( + self.checkpoint_manager.upload, + ckpt_id=ckpt_id, + local_state_path=local_state_path, + keep_local_file=keep_local_file, + ) + + dcp_save_future.add_done_callback(_on_dcp_done) + else: + # If async_upload=False, block until DCP async save completes, then upload. 
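# Hedged sketch of the checkpoint_config keys read above (nesting is illustrative; see
# CheckpointManager for the authoritative schema):
#   checkpoint_config:
#     async_upload: true         # hand the upload to the background thread executor
#     keep_local_file: false     # whether to keep the local copy after uploading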
+ if dcp_save_future is not None: + dcp_save_future.result() + + if async_upload: + self.thread_executor.submit( + self.checkpoint_manager.upload, + ckpt_id=ckpt_id, + local_state_path=local_state_path, + keep_local_file=keep_local_file, + ) + else: + self.checkpoint_manager.upload( + ckpt_id=ckpt_id, + local_state_path=local_state_path, + keep_local_file=keep_local_file, + ) + + return { + "load": load_timer.last, + "dcp_save": dcp_timer.last, + "hf_save": hf_timer.last, + } + + def _load_torch_file(self, path: str, required: bool = True): + if os.path.exists(path): + return torch.load(path, map_location="cpu", weights_only=False) + if required: + raise FileNotFoundError(f"Missing checkpoint shard: {path}") + logger.warning(f"Optional checkpoint shard missing, skipping: {path}") + return None + + def load_checkpoint(self, load_dir, tag="checkpoint", **kwargs): + """ + Load checkpoint from a shared directory where all ranks' sharded checkpoints are stored. + + In FSDP, synchronize the load_dir across all ranks to ensure they load from the same location. + """ + logger.info(f"load_dir: {load_dir}") + + dcp_checkpoint_dir = self._get_dcp_checkpoint_dir(load_dir) + used_dcp = False + if os.path.isdir(dcp_checkpoint_dir): + if dist.is_initialized(): + dist.barrier() + + self._load_checkpoint_with_dcp( + checkpoint_dir=dcp_checkpoint_dir, + ) + used_dcp = True + logger.info(f"Loaded DCP checkpoint from {dcp_checkpoint_dir}") + if dist.is_initialized(): + dist.barrier() + return + + @staticmethod + def get_rng_state(): + rng_state = { + "cpu": torch.get_rng_state(), + "cuda": torch.cuda.get_rng_state(), + "numpy": np.random.get_state(), + "random": random.getstate(), + } + return rng_state + + @staticmethod + def load_rng_state(rng_state): + torch.set_rng_state(rng_state["cpu"]) + torch.cuda.set_rng_state(rng_state["cuda"]) + np.random.set_state(rng_state["numpy"]) + random.setstate(rng_state["random"]) + + def _copy_weight_to_param(self, param: torch.nn.Parameter, weight: torch.Tensor): + """ + Copy a full (replicated) tensor onto a possibly-sharded FSDP2 parameter. + Handles DTensor placement to keep shards consistent across ranks. 
+ """ + + target = param.data if hasattr(param, "data") else param + source = weight.data if hasattr(weight, "data") else weight + source = source.detach() + + if isinstance(source, DTensor): + if isinstance(target, DTensor): + same_mesh = source.device_mesh == target.device_mesh + same_place = source.placements == target.placements + if same_mesh and same_place: + target.copy_(source) + return + source = source.full_tensor() + + if isinstance(target, DTensor): + sharded = distribute_tensor( + source.to(target.device), + target.device_mesh, + target.placements, + ) + target.copy_(sharded) + else: + target.copy_(source.to(target.device)) + + def _gather_full_tensor(self, param: torch.nn.Parameter) -> torch.Tensor: + tensor = param.data if hasattr(param, "data") else param + if isinstance(tensor, DTensor): + original_device = tensor.device + if original_device.type == "cpu" and current_platform.device_type == "cuda" and torch.cuda.is_available(): + tensor = tensor.to(current_platform.device_type) + tensor = tensor.full_tensor() + if original_device.type == "cpu": + tensor = tensor.cpu() + # full_tensor() already returns a new tensor from all-gather + return tensor.detach() + # For non-DTensor (e.g., LoRA params that aren't sharded), we need to clone + # to avoid modifying the original parameter during bucket packing + return tensor.detach().clone() + + def _move_optimizer_states(self, device: torch.device, non_blocking: bool = False): + optimizer = getattr(self, "optimizer", None) + if optimizer is None: + return + for state in optimizer.state.values(): + for key, value in state.items(): + if torch.is_tensor(value): + state[key] = value.to(device, non_blocking=non_blocking) + + def _get_broadcast_tensor(self, weight_cpu: torch.Tensor) -> torch.Tensor: + """ + Reuse buffer to avoid allocating new memory. 
+ """ + if current_platform.device_type == "cpu": + return weight_cpu + numel = weight_cpu.numel() + dtype = weight_cpu.dtype + buffer = self._model_update_device_buffer + if buffer is None or buffer.numel() < numel or buffer.dtype != dtype: + buffer = torch.empty(numel, dtype=dtype, device=current_platform.device_type) + self._model_update_device_buffer = buffer + device_view = buffer[:numel].view(weight_cpu.shape) + device_view.copy_(weight_cpu, non_blocking=True) + return device_view + + def get_data_input(self, batch: DataProto): + """Ensure Ulysses/context-parallel ranks receive identical data.""" + + def broadcast_obj(obj, group): + obj_list = [obj if dist.get_rank(group) == 0 else None] + src_rank = dist.get_process_group_ranks(group)[0] + dist.broadcast_object_list(obj_list, src=src_rank, group=group) + return obj_list[0] + + if getattr(self.worker.rank_info, "cp_size", 1) <= 1: + return batch + + broadcast_non_tensor_batch = batch.meta_info.get("_broadcast_non_tensor_batch", False) + if broadcast_non_tensor_batch: + tmp_batch = broadcast_obj(batch, get_ulysses_group()) + batch.batch = tmp_batch.batch + batch.non_tensor_batch = tmp_batch.non_tensor_batch + else: + batch.batch = broadcast_obj(batch.batch, get_ulysses_group()) + return batch + + def _prepare_fsdp2_model( + self, + model_provider, + *, + is_trainable: bool, + default_model_dtype: torch.dtype, + warmup_collective: bool = False, + ): + + set_seed(seed=self.worker.pipeline_config.seed) + + if not torch.distributed.is_initialized(): + if current_platform.device_type != "cpu": + backends_str = f"cpu:gloo,{current_platform.device_type}:{current_platform.communication_backend}" + else: + backends_str = current_platform.communication_backend + torch.distributed.init_process_group(backend=backends_str) + + if warmup_collective: + dist.all_reduce(torch.zeros(1).to(current_platform.device_type)) + + if self.worker_config.strategy_args.strategy_config.get("apply_tiled_mlp", False): + from roll.third_party.fsdp2.tiled_mlp import apply_tiled_mlp_monkey_patch + + apply_tiled_mlp_monkey_patch( + num_shards=self.worker_config.strategy_args.strategy_config.get("tiled_num_shards", 4), + model_type=self.worker_config.strategy_args.strategy_config.get("model_type", None), + ) + + world_size = torch.distributed.get_world_size() + global_rank = torch.distributed.get_rank() + + cp_size = self.worker_config.model_args.ulysses_size + if cp_size > 1: + if current_platform.apply_ulysses_patch() is not None: + set_upg_manager( + ulysses_size=cp_size, + rank=global_rank, + world_size=world_size, + ) + else: + cp_size = 1 + + if self.worker_config.model_args.ulysses_size != cp_size: + # PumpkinComment: Fallback if something goes wrong with CP + logger.warning( + f"ulysses_size in config ({self.worker_config.model_args.ulysses_size}) is not equal to cp_size ({cp_size}), using cp_size instead" + ) + self.worker_config.strategy_args.strategy_config["fsdp_size"] = ( + self.worker_config.strategy_args.strategy_config["fsdp_size"] + * self.worker_config.model_args.ulysses_size + ) + self.worker_config.model_args.ulysses_size = cp_size + + self.worker.rank_info.dp_rank = global_rank // cp_size + self.worker.rank_info.dp_size = world_size // cp_size + self.worker.rank_info.cp_rank = global_rank % cp_size + self.worker.rank_info.cp_size = cp_size + + if cp_size > 1 and global_rank == 0: + logger.debug(f"FSDP2 CP(Ulysses) enabled: cp_size={cp_size}, dp_size={self.worker.rank_info.dp_size}") + + self.tokenizer = 
default_tokenizer_provider(model_args=self.worker_config.model_args) + self.processor = default_processor_provider(model_args=self.worker_config.model_args) + + torch_dtype = self.worker_config.strategy_args.strategy_config.get("param_dtype", default_model_dtype) + torch_dtype = _parse_dtype(torch_dtype) + self.worker_config.model_args.compute_dtype = torch_dtype + + fsdp_size = self.worker_config.strategy_args.strategy_config.get("fsdp_size", 1) + if cp_size > 1 and (fsdp_size <= 1 or fsdp_size >= world_size): + fsdp_size = world_size // cp_size + self.worker_config.strategy_args.strategy_config["fsdp_size"] = fsdp_size + if global_rank == 0: + logger.info(f"CP enabled: auto-setting fsdp_size={fsdp_size} so ddp_size==cp_size for hybrid sharding") + elif fsdp_size != world_size: + logger.warning(f"fsdp_size {fsdp_size} is not equal to world_size {world_size}, using world_size instead") + fsdp_size = world_size + + self.device_mesh = create_device_mesh_with_ulysses(world_size=world_size, fsdp_size=fsdp_size) + + model_name_or_path = download_model(self.worker_config.model_args.model_name_or_path) + config = AutoConfig.from_pretrained( + model_name_or_path, + trust_remote_code=True, + **self.worker_config.model_args.model_config_kwargs, + ) + + self._validate_ulysses_compat(config, cp_size) + + use_meta_tensor = not getattr(config, "tie_word_embeddings", False) + init_context = get_init_weight_context_manager( + use_meta_tensor=use_meta_tensor, + mesh=self.device_mesh, + ) + + set_fsdp2_init_context(init_context) + try: + model = model_provider( + tokenizer=self.tokenizer, + model_args=self.worker_config.model_args, + is_trainable=is_trainable, + ) + finally: + clear_fsdp2_init_context() + + self.is_lora = self.worker_config.model_args.lora_target is not None + + return model, torch_dtype, cp_size + + @staticmethod + def _validate_ulysses_compat(config, cp_size: int): + try: + num_attention_heads, num_key_value_heads = ( + config.num_attention_heads, + config.num_key_value_heads, + ) + except AttributeError: + num_attention_heads, num_key_value_heads = ( + config.text_config.num_attention_heads, + config.text_config.num_key_value_heads, + ) + + assert ( + num_attention_heads % cp_size == 0 + ), f"num_attention_heads {num_attention_heads} must be divisible by ulysses_size {cp_size}" + assert num_key_value_heads % cp_size == 0 or cp_size % num_key_value_heads == 0, ( + f"num_key_value_heads {num_key_value_heads} must be divisible by ulysses_size " + f"{cp_size}or vise versa. Upon ulysses_size % num_key_value_heads == 0," + f"kv heads are repeated to ensure correctness." + ) + + def load_states(self, include=None, non_blocking=False): + if not self.cpu_offload_enabled: + if include is None or OffloadStateType.model_params in include: + device = current_platform.current_device() + self.model.to(device, non_blocking=non_blocking) + # When cpu_offload is disabled, always keep optimizer states on GPU + self._move_optimizer_states(current_platform.current_device(), non_blocking=non_blocking) + else: + # When cpu_offload is enabled, only load optimizer states if requested + if include is None or OffloadStateType.optimizer_states in include: + self._move_optimizer_states( + current_platform.current_device(), + non_blocking=non_blocking, + ) + + def offload_states(self, include=None, non_blocking=False): + """ " + PumpkinComment: + + If CPUOFFloadPolicy is True: Every thing about offload /load model param is built from FSDP2. 
+ If CPUOFFloadPolicy is False: The model param in on GPU, we need to mvoe the optimizer to GPU as well. + + Therefore, we actually could leave model param. offload/onload logic to FSDP2 during training + But here, I maintain mannual support and compatible with FSDP2 CPUOFFloadPolicy for other offload logic. + """ + if not self.cpu_offload_enabled: + if include is None or OffloadStateType.model_params in include: + self.model.to("cpu", non_blocking=non_blocking) + if current_platform.device_type == "cuda": + torch.cuda.empty_cache() + # When cpu_offload is disabled, optimizer states should stay on GPU + # Only offload optimizer states if cpu_offload is enabled + else: + # When cpu_offload is enabled, offload optimizer states + if include is None or OffloadStateType.optimizer_states in include: + self._move_optimizer_states(torch.device("cpu"), non_blocking=non_blocking) + + +class FSDP2InferStrategy(FSDP2StrategyBase): + strategy_name = "fsdp2_infer" + + def __init__(self, worker: Worker): + super().__init__(worker) + self.device_mesh = None + self.fsdp_config = None + + def initialize(self, model_provider): + model, torch_dtype, _ = self._prepare_fsdp2_model( + model_provider, + is_trainable=False, + default_model_dtype=torch.bfloat16, + ) + + self.setup_fsdp2_configuration() + self.initialize_fsdp2_model(model) + + dist.barrier() + + def setup_fsdp2_configuration(self): + """Setup FSDP-2 configuration""" + # ckpt strategy + async_save_strategy = self.worker_config.strategy_args.strategy_config.get("async_save_ckpt", True) + self.async_save_strategy = async_save_strategy + if self.async_save_strategy: + self.checkpoint_future = None + + # Get mixed precision settings from config + param_dtype = self.worker_config.strategy_args.strategy_config.get("param_dtype", torch.bfloat16) + reduce_dtype = self.worker_config.strategy_args.strategy_config.get("reduce_dtype", torch.float32) + + # Convert string dtype specifications to torch.dtype + param_dtype = _parse_dtype(param_dtype) + reduce_dtype = _parse_dtype(reduce_dtype) + self.param_dtype = param_dtype + self.reduce_dtype = reduce_dtype + + mixed_precision = MixedPrecisionPolicy( + param_dtype=param_dtype, + reduce_dtype=reduce_dtype, + cast_forward_inputs=True, + ) + + # Reshard after forward setting (FSDP2 uses this instead of sharding_strategy) + # FULL_SHARD: reshard_after_forward=True + # SHARD_GRAD_OP: reshard_after_forward=False + # HYBRID_SHARD: reshard_after_forward=True with a 2D device mesh + # HYBRID_SHARD_ZERO2: reshard_after_forward=False with a 2D device mesh + # If None, True for submodules, False for root module + reshard_after_forward = self.worker_config.strategy_args.strategy_config.get("reshard_after_forward", None) + + offload_policy_cfg = self.worker_config.strategy_args.strategy_config.get("offload_policy", False) + self.cpu_offload_enabled = bool(offload_policy_cfg) + offload_policy = None + if self.cpu_offload_enabled: + offload_policy = CPUOffloadPolicy( + pin_memory=True, + ) + + # Store configuration for fully_shard() + self.fsdp_config = { + "mesh": self.device_mesh, + "reshard_after_forward": reshard_after_forward, + "mp_policy": mixed_precision, + "offload_policy": offload_policy, + "shard_placement_fn": get_shard_placement_fn( + fsdp_size=self.worker_config.strategy_args.strategy_config.get("fsdp_size", 1) + ), + } + + def initialize_fsdp2_model(self, model): + offload_policy = self.fsdp_config["offload_policy"] + full_state = model.state_dict() + apply_fsdp2( + model, + self.fsdp_config, + 
self.worker_config.strategy_args.strategy_config, + self.is_lora, + ) + + fsdp2_load_full_state_dict( + model, + full_state, + self.device_mesh, + offload_policy, + ) + + self.model = model + + def forward_step( + self, + batch: DataProto, + forward_func: Callable[ + [DataProto, torch.Tensor], + Tuple[torch.Tensor, Dict[str, torch.Tensor]], + ], + ) -> Dict[str, torch.Tensor]: + self.model.eval() + batch_size = batch.batch.batch_size[0] + micro_batch_size = batch.meta_info["micro_batch_size"] + num_microbatches = max(batch_size // micro_batch_size, 1) + micro_batches = batch.chunk(chunks=num_microbatches) + + cp_size = self.worker.rank_info.cp_size + batch_num_tokens = self._get_batch_num_tokens(batch) + batch.meta_info['batch_num_tokens'] = {k: v // cp_size for k, v in batch_num_tokens.items()} + global_valid_tokens = self._get_global_valid_samples(batch) + batch.meta_info['global_valid_samples'] = {k: v // cp_size for k, v in global_valid_tokens.items()} + + loss_scale = num_microbatches * self.worker.rank_info.dp_size + + disable_adapter = batch.meta_info.get("disable_adapter", False) + adapter_context = self.unwrap_model().disable_adapter() if disable_adapter else nullcontext() + losses_reduced = [] + + with adapter_context: + for data in micro_batches: + with torch.autocast( + device_type=current_platform.device_type, + dtype=self.param_dtype, + ): + input_ids = data.batch["input_ids"] + attention_mask = data.batch["attention_mask"] + position_ids = data.batch["position_ids"] + forward_args = data.meta_info.get("forward_args", {}) + if position_ids.dim() == 3: + # qwen-vl mrope-style 3D position_ids stored in DataProto as (bsz, C, seqlen) + # transpose to (C, bsz, seqlen) for model forward. + position_ids = position_ids.transpose(0, 1) # (bsz, C, seqlen) -> (C, bsz, seqlen) + if "multi_modal_inputs" in data.non_tensor_batch: + multi_modal_inputs = data.non_tensor_batch["multi_modal_inputs"] + multi_modal_data = defaultdict(list) + # mm inputs of some samples would be empty to allow text and mm mixed data + for sample_mm_inputs in multi_modal_inputs: + for key in sample_mm_inputs.keys(): + multi_modal_data[key].append(sample_mm_inputs[key]) + for key in multi_modal_data.keys(): + assert key not in forward_args + # DataProto.to('cuda') in upper frame not work for non_tensor_batch + forward_args[key] = torch.concat(multi_modal_data[key], dim=0).to(input_ids.device) + forward_args.update({"force_vit_image": True}) + + logits = self._fsdp2_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + forward_args=forward_args, + ) + + loss, loss_reduced = forward_func(data, logits) + if self.worker_config.apply_loss_scale: + loss *= loss_scale + losses_reduced.append(loss_reduced) + + results = collate_fn_to_dict_list(losses_reduced) + return results + + def get_feature_on_cp_rank( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor = None, + position_ids: torch.Tensor = None, + ): + """Get features for specific context parallel rank""" + seqlens_in_batch = input_ids.size(1) + assert ( + seqlens_in_batch % self.worker.rank_info.cp_size == 0 + ), f"input_length={seqlens_in_batch} not divisible by cp_size={self.worker.rank_info.cp_size}" + cp_middle_rank_len = seqlens_in_batch // self.worker.rank_info.cp_size + padded_input_ids = input_ids + result = {} + start_index = cp_middle_rank_len * self.worker.rank_info.cp_rank + end_index = cp_middle_rank_len * (self.worker.rank_info.cp_rank + 1) + result["input_ids"] = padded_input_ids[:, 
start_index:end_index] + if attention_mask is not None: + result["attention_mask"] = attention_mask[:, start_index:end_index] + if position_ids is not None: + if position_ids.dim() == 3: + result["position_ids"] = position_ids[:, :, start_index:end_index] + else: + result["position_ids"] = position_ids[:, start_index:end_index] + return result + + def _fsdp2_forward( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + forward_args: Dict[str, torch.Tensor], + ) -> torch.Tensor: + cp_size = self.worker.rank_info.cp_size + cp_rank = self.worker.rank_info.cp_rank + + # PumpkinComment: + # - do NOT slice padded tensors first (would reintroduce imbalance) + # - unpad to token stream, pad-to-multiple-of-cp, slice equally, run model with attn_mask=None + # - gather outputs and unpad, then pad back to original (bs, seqlen) so downstream remains unchanged + if cp_size > 1: + underlying = self.unwrap_model() + model_type = getattr(getattr(underlying, "config", None), "model_type", "") or "" + is_vlm = getattr(getattr(underlying, "config", None), "vision_config", None) is not None + is_supported_vlm = is_vlm and model_type in ("qwen2_5_vl", "qwen3_vl") + + if not is_supported_vlm: + features = self.get_feature_on_cp_rank(input_ids, attention_mask, position_ids) + input_ids = features["input_ids"] + attention_mask = features["attention_mask"] + position_ids = features["position_ids"] + + # Ensure use_cache is False if not specified (matches HF strategy) + if "use_cache" not in forward_args: + forward_args["use_cache"] = False + + return self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **forward_args, + ).logits + + def generate(self, batch: DataProto, generation_config): + if self.worker.rank_info.cp_size > 1: + raise RuntimeError("FSDP2 generate() is not supported with CP(Ulysses) enabled yet. 
") + input_ids = batch.batch["input_ids"] # (bs, prompt_length) + attention_mask = batch.batch["attention_mask"] # left-padded attention_mask + + output = self.model.generate( + input_ids=input_ids, + attention_mask=attention_mask, + use_cache=True, + **generation_config, + ) + + return output + + def unwrap_model(self): + if hasattr(self.model, "module"): + return self.model.module + return self.model + + def broadcast_parameter( + self, + model_update_name, + src_pp_rank, + dtype, + shape, + parameter_name, + is_lora=False, + ): + if model_update_name not in self.model_update_comm_plan: + self.model_update_comm_plan[model_update_name] = {} + if src_pp_rank not in self.model_update_comm_plan[model_update_name]: + self._setup_collective_group_impl( + model_update_name=model_update_name, + comm_plan=None, + backend=None, + mode="receiver", + ) + comm_plan = self.model_update_comm_plan[model_update_name][src_pp_rank] + weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) + collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) + param = self.model.get_parameter(parameter_name) + self._copy_weight_to_param(param, weight) + del weight + + def update_parameter( + self, + model_update_name, + parameter_name, + weight, + ranks_in_worker, + is_lora: bool = False, + ): + # TODO: Update in bucket + param = self.model.get_parameter(parameter_name) + self._copy_weight_to_param(param, weight) + del weight + + def op_compute_log_probs( + self, + logits: torch.Tensor, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + ): + """ + input_ids [[p, p, r, r, r, 0, 0]] p: prompt, r: response, 0: pad + response_mask [[0, 0, 1, 1, 1, 0, 0]] + """ + # Create labels from FULL input_ids (shifted by 1) + labels: torch.Tensor = input_ids[:, 1:].clone() + labels[attention_mask[:, 1:] == 0] = 0 # avoid invalid token id + + if self.worker.rank_info.cp_size > 1: + # For CP: slice the shifted labels to match the sharded logits + # logits are sharded across sequence dimension by Ulysses + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + labels = self.get_feature_on_cp_rank(labels)["input_ids"] + + # Compute log_probs for this CP rank + log_probs = log_probs_from_logits(logits, labels) + + log_probs = ulysses_gather( + log_probs, + gather_dim=1, + group=get_ulysses_group(), + grad_scaler=True, + ) + + # Apply mask using FULL attention_mask and handle the shift + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + else: + # Non-CP path: original logic + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + log_probs = log_probs_from_logits(logits, labels) + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + + return log_probs + + def op_compute_entropy(self, logits: torch.Tensor, attention_mask: torch.Tensor): + from roll.utils.functionals import entropy_from_logits + + entropy = entropy_from_logits(logits) + if self.worker.rank_info.cp_size > 1: + entropy = ulysses_gather( + entropy, + gather_dim=1, + group=get_ulysses_group(), + grad_scaler=True, + ) + entropy = entropy[:, :-1] * attention_mask[:, 1:] + return entropy + + +class FSDP2TrainStrategy(FSDP2InferStrategy, TrainStrategy): + strategy_name = "fsdp2_train" + + def initialize(self, model_provider): + model, torch_dtype, _ = self._prepare_fsdp2_model( + model_provider, + is_trainable=True, + default_model_dtype=torch.float32, + warmup_collective=True, + ) + + logger.info(f"max steps pipeline {self.worker_config.training_args.max_steps}") + 
self.worker_config.training_args.max_steps = ( + self.worker_config.training_args.max_steps // self.worker.rank_info.dp_size + ) + logger.info(f"max steps worker train {self.worker_config.training_args.max_steps}") + + # Setup FSDP-2 configuration + self.setup_fsdp2_configuration() + + if self.param_dtype == torch.float16: + from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler + + self.scaler = ShardedGradScaler(growth_interval=400) + else: + self.scaler = None + + # Initialize FSDP-2 model + self.initialize_fsdp2_model(model) + + # In-case of LoRA + trainable_params = (param for param in self.model.parameters() if param.requires_grad) + self.optimizer = optim.AdamW( + trainable_params, + lr=self.worker_config.training_args.learning_rate, + betas=( + self.worker_config.training_args.adam_beta1, + self.worker_config.training_args.adam_beta2, + ), + weight_decay=self.worker_config.training_args.weight_decay, + ) + + self.scheduler = get_scheduler( + self.worker_config.training_args.lr_scheduler_type, + self.optimizer, + num_warmup_steps=self.worker_config.training_args.get_warmup_steps( + self.worker_config.training_args.max_steps + ), + num_training_steps=self.worker_config.training_args.max_steps, + ) + + dist.barrier() + + def _grad_accumulation_context(self): + set_sync_fn = getattr(self.model, "set_requires_gradient_sync", None) + if callable(set_sync_fn): + return self._requires_grad_sync_context(set_sync_fn) + + no_sync_method = getattr(self.model, "no_sync", None) + if callable(no_sync_method): + return no_sync_method() + + return contextlib.nullcontext() + + @contextlib.contextmanager + def _requires_grad_sync_context(self, set_sync_fn): + set_sync_fn(False) + try: + yield + finally: + set_sync_fn(True) + + def _clip_grad_norm(self, max_norm: float): + if not self.cpu_offload_enabled: + grad_norm = clip_grad_norm_( + self.model.parameters(), + max_norm=max_norm, + ) + else: + grad_norm = self._clip_grad_norm_cpu_offload(max_norm) + + if isinstance(grad_norm, DTensor): + grad_norm = grad_norm.full_tensor() + + return grad_norm + + def _clip_grad_norm_cpu_offload(self, max_norm: float): + """ + Mirror VERL's fsdp2_clip_grad_norm_: + 1. operate on local gradients + 2. 
move norm scalar to GPU (avoid CPU DTensor collectives) + + Reference: https://github.com/volcengine/verl/blob/main/verl/utils/fsdp_utils.py#L566 + Related discussion: https://github.com/volcengine/verl/pull/1026#discussion_r2064879123 + """ + parameters = list(self.model.parameters()) + grads = [p.grad for p in parameters if getattr(p, "grad", None) is not None] + if not grads: + device = current_platform.current_device() + return torch.zeros(1, device=device) + + total_norm = _get_total_norm( + grads, + norm_type=2.0, + error_if_nonfinite=False, + foreach=None, + ) + total_norm = total_norm.to(current_platform.current_device(), non_blocking=True) + _clip_grads_with_norm_( + parameters, + max_norm=max_norm, + total_norm=total_norm, + foreach=None, + ) + return total_norm + + def train_step( + self, + batch: DataProto, + loss_func: Callable[ + [DataProto, torch.Tensor], + Tuple[torch.Tensor, Dict[str, torch.Tensor]], + ], + no_sync: bool = False, + ): + """ + Comment: + no_sync: Usually, the inner step already handle no-sync, but leave this option for user if want other accumulation logic + """ + self.model.train() + mini_batch_size = self.worker_config.training_args.per_device_train_batch_size + data_iter = batch.make_iterator(mini_batch_size=mini_batch_size, epochs=1) + mini_steps = batch.batch.batch_size[0] // self.worker_config.training_args.per_device_train_batch_size + + cp_size = self.worker.rank_info.cp_size + batch_num_tokens = self._get_batch_num_tokens(batch) + batch.meta_info['batch_num_tokens'] = {k: v // cp_size for k, v in batch_num_tokens.items()} + global_valid_tokens = self._get_global_valid_samples(batch) + batch.meta_info['global_valid_samples'] = {k: v // cp_size for k, v in global_valid_tokens.items()} + loss_scale = mini_steps * self.worker.rank_info.dp_size + batch.meta_info['micro_batch_size'] = mini_batch_size + + gradient_accumulation_steps = self.worker_config.training_args.gradient_accumulation_steps + + metrics = {} + cp_size = max(self.worker.rank_info.cp_size, 1) + + for step in range(mini_steps): + data: DataProto = next(data_iter) + input_ids = data.batch["input_ids"] + attention_mask = data.batch["attention_mask"] + position_ids = data.batch["position_ids"] + forward_args = data.meta_info.get("forward_args", {}) + if position_ids.dim() == 3: + position_ids = position_ids.transpose(0, 1) # (bsz, C, seqlen) -> (C, bsz, seqlen) + if "multi_modal_inputs" in data.non_tensor_batch: + multi_modal_inputs = data.non_tensor_batch["multi_modal_inputs"] + multi_modal_data = defaultdict(list) + for sample_mm_inputs in multi_modal_inputs: + for key in sample_mm_inputs.keys(): + multi_modal_data[key].append(sample_mm_inputs[key]) + for key in multi_modal_data.keys(): + assert key not in forward_args + forward_args[key] = torch.concat(multi_modal_data[key], dim=0).to(input_ids.device) + forward_args.update({"force_vit_image": True}) + + sync_boundary = ((step + 1) % gradient_accumulation_steps == 0 or (step + 1 == mini_steps)) and not no_sync + + # PumpkinComment: + # model.no_sync is replaced by model.set_requires_gradient_sync(False) in FSDP2 + # but also add support for model.no_sync for compatibility + sync_context = ( + self._grad_accumulation_context() if not sync_boundary and not no_sync else contextlib.nullcontext() + ) + + with sync_context: + logits = self._fsdp2_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + forward_args=forward_args, + ) + + loss, loss_reduced = loss_func(data, logits) + append_to_dict(metrics, 
loss_reduced) + + if self.worker_config.apply_loss_scale: + loss *= loss_scale + + loss = loss / gradient_accumulation_steps + + if self.scaler is not None: + self.scaler.scale(loss).backward() + else: + loss.backward() + + if sync_boundary: + if self.scaler is not None: + self.scaler.unscale_(self.optimizer) + grad_norm = self._clip_grad_norm( + max_norm=self.worker.pipeline_config.max_grad_norm, + ) + metrics[f"{self.worker_config.name}/grad_norm"] = grad_norm.item() + + if self.scaler is not None: + self.scaler.step(self.optimizer) + self.scaler.update() + else: + if not torch.isfinite(grad_norm): + logger.warning(f"WARN: rank {dist.get_rank()} grad_norm is not finite: {grad_norm}") + else: + self.optimizer.step() + self.scheduler.step() + self.optimizer.zero_grad(set_to_none=True) + + torch.cuda.empty_cache() + return metrics + + def setup_model_update(self, infer_cluster, model_update_name: str): + assert model_update_name not in self.weight_updaters + is_lora = self.worker_config.model_args.lora_target is not None + self.weight_updaters[model_update_name] = FSDP2WeightUpdater( + pipeline_config=self.worker.pipeline_config, + infer_cluster=infer_cluster, + worker_config=self.worker_config, + model_update_name=model_update_name, + model=self.unwrap_model(), + is_lora=is_lora, + ) + + def model_update(self, model_update_name: str): + return self.weight_updaters[model_update_name].model_update() diff --git a/roll/distributed/strategy/fsdp_strategy.py b/roll/distributed/strategy/fsdp_strategy.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/roll/distributed/strategy/hf_strategy.py b/roll/distributed/strategy/hf_strategy.py index 22c053dcf..a775a0cd7 100644 --- a/roll/distributed/strategy/hf_strategy.py +++ b/roll/distributed/strategy/hf_strategy.py @@ -1,14 +1,13 @@ -from concurrent import futures from collections import defaultdict +from concurrent import futures from datetime import timedelta -from typing import List, Optional, Callable, Dict, Tuple +from typing import Callable, Dict, List, Optional, Tuple import deepspeed import torch import torch.distributed as dist from accelerate import cpu_offload_with_hook from accelerate.hooks import UserCpuOffloadHook -from roll.utils.collective import collective from torch.nn.utils.rnn import pad_sequence from transformers import set_seed @@ -17,9 +16,12 @@ from roll.distributed.strategy.strategy import InferenceStrategy from roll.models.func_providers import log_probs_forward_step_func from roll.models.model_providers import default_tokenizer_provider -from roll.utils.logging import get_logger -from roll.utils.offload_states import OffloadStateType, offload_hf_model, load_hf_model from roll.platforms import current_platform +from roll.utils.collective import collective +from roll.utils.cuda_ipc_utils import MultiprocessingSerializer +from roll.utils.logging import get_logger +from roll.utils.offload_states import OffloadStateType, load_hf_model, offload_hf_model +from roll.utils.send_recv_utils import monkey_patch_torch_reductions, named_tensors_from_bucket logger = get_logger() @@ -31,10 +33,14 @@ def __init__(self, worker: "Worker"): super().__init__(worker) self.executor: futures.ThreadPoolExecutor = futures.ThreadPoolExecutor(max_workers=1) self.generate_config = None + self.buffer_cache = None def initialize(self, model_provider): set_seed(seed=self.worker.pipeline_config.seed) - dist.init_process_group(backend=current_platform.communication_backend, timeout=timedelta(minutes=self.worker_config.backend_timeout)) + 
dist.init_process_group( + backend=current_platform.communication_backend, + timeout=timedelta(minutes=self.worker_config.backend_timeout), + ) dist.all_reduce(torch.zeros(1).to(current_platform.device_type)) self.worker.rank_info.dp_rank = dist.get_rank() @@ -64,8 +70,9 @@ def forward_step( position_ids = data.batch["position_ids"] forward_args = data.meta_info.get("forward_args", {}) if position_ids.dim() == 3: - # qwen2vl mrope, maybe use a placeholder and let model generate position_ids - position_ids = position_ids.transpose(0, 1) # (bsz, 3, seqlen) -> (3, bsz, seqlen) + # qwen-vl mrope-style 3D position_ids stored in DataProto as (bsz, C, seqlen) + # transpose to (C, bsz, seqlen) for model forward. + position_ids = position_ids.transpose(0, 1) # (bsz, C, seqlen) -> (C, bsz, seqlen) if "multi_modal_inputs" in data.non_tensor_batch: multi_modal_inputs = data.non_tensor_batch["multi_modal_inputs"] multi_modal_data = defaultdict(list) @@ -95,6 +102,7 @@ def forward_step( return results def generate(self, batch: DataProto, generation_config): + generation_config.pop("logprobs", None) if self.generate_config is None: self.generate_config = generation_config logger.info(f"generate_config: {self.generate_config}") @@ -132,43 +140,41 @@ def generate(self, batch: DataProto, generation_config): def unwrap_model(self): return self.model - # 参数同步相关接口 - def broadcast_bucket(self, model_update_name, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan[model_update_name]: - return - comm_plan = self.model_update_comm_plan[model_update_name][src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter_in_bucket(model_update_name, meta_infos, buffer, [dist.get_rank()]) - - def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, parameter_name, is_lora=False): + def broadcast_parameter(self, names, dtypes, shapes, group_name, is_lora=False): assert ( self.worker_config.num_gpus_per_worker == 1 ), "hf generate only support on device, please use vllm instead." 
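The rewritten `broadcast_parameter(names, dtypes, shapes, group_name, ...)` (body shown further down in this hunk) reduces to an allocate / async-broadcast / lazy-wait loop. Below is a minimal standalone sketch of that pattern for illustration only: it uses plain `torch.distributed` instead of roll's `collective` wrapper, and the helper name and arguments are hypothetical, not part of the patch.

```python
# Sketch only: assumes torch.distributed is already initialized and `model`
# lives on the current CUDA device.
from typing import Iterable, Tuple

import torch
import torch.distributed as dist
import torch.nn as nn


def receive_broadcast_weights(
    model: nn.Module,
    specs: Iterable[Tuple[str, torch.dtype, torch.Size]],
    group: dist.ProcessGroup,
    src: int = 0,
) -> None:
    # 1) allocate a receive buffer per parameter and launch every broadcast up front
    pending = []
    for name, dtype, shape in specs:
        buf = torch.empty(shape, dtype=dtype, device="cuda")
        work = dist.broadcast(buf, src=src, group=group, async_op=True)
        pending.append((name, buf, work))

    # 2) wait for each broadcast only when its tensor is consumed, overlapping
    #    communication with the parameter copies
    for name, buf, work in pending:
        work.wait()
        model.get_parameter(name).data.copy_(buf)
        del buf
```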
- if src_pp_rank not in self.model_update_comm_plan[model_update_name]: - return - comm_plan = self.model_update_comm_plan[model_update_name][src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter(model_update_name, parameter_name, weight, [dist.get_rank()]) - - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return + assert not is_lora + + weights_and_handles = [] + for name, dtype, shape in zip(names, dtypes, shapes): + target_dtype = dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype) + weight = torch.empty(shape, dtype=target_dtype, device=self.device) + handle = collective.broadcast(tensor=weight, src_rank=0, group_name=group_name, async_op=True) + weights_and_handles.append((name, weight, handle)) + + def weights_iter(): + for name, weight, handle in weights_and_handles: + handle.wait() + yield name, weight + + for name, weight in weights_iter(): + self.update_parameter(name, weight) + + def update_parameter(self, parameter_name, weight): param = self.model.get_parameter(parameter_name) param.data.copy_(weight) del weight - def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return - from mcore_adapter.models.converter.convert_utils import RecvBucketManager + def update_parameter_in_bucket(self, serialized_named_tensors, is_lora=False): + # TODO: add lora + assert not is_lora - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - for name, weight in named_params.items(): - self.update_parameter(model_update_name, name, weight, ranks_in_worker) + monkey_patch_torch_reductions() + bucket_with_meta = MultiprocessingSerializer.deserialize(serialized_named_tensors[0]) + named_params = named_tensors_from_bucket(**bucket_with_meta) + for name, weight in named_params: + self.update_parameter(name, weight) # offload/load 相关接口 def load_states(self, *args, **kwargs): diff --git a/roll/distributed/strategy/megatron_strategy.py b/roll/distributed/strategy/megatron_strategy.py index e4038b8f5..68fa26b37 100644 --- a/roll/distributed/strategy/megatron_strategy.py +++ b/roll/distributed/strategy/megatron_strategy.py @@ -2,11 +2,13 @@ import os import random from collections import defaultdict +from contextlib import nullcontext from functools import partial -from typing import Callable, Dict, Iterator, List, Tuple +from typing import TYPE_CHECKING, Callable, Dict, Iterator, List, Tuple import numpy as np import ray +import ray.actor import torch import torch.distributed as dist from codetiming import Timer @@ -18,26 +20,33 @@ from megatron.core.distributed import DistributedDataParallelConfig, finalize_model_grads from megatron.core.models.common.embeddings import RotaryEmbedding from megatron.core.optimizer import MegatronOptimizer, OptimizerConfig +from megatron.core.packed_seq_params import PackedSeqParams from megatron.core.pipeline_parallel import get_forward_backward_func -from megatron.core.tensor_parallel import gather_from_tensor_model_parallel_region, reduce_from_tensor_model_parallel_region +from megatron.core.tensor_parallel import ( + gather_from_tensor_model_parallel_region, + reduce_from_tensor_model_parallel_region, +) +from 
megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy from megatron.core.transformer.moe.moe_utils import ( clear_aux_losses_tracker, get_moe_layer_wise_logging_tracker, reduce_aux_losses_tracker_across_ranks, ) -from megatron.core.tensor_parallel.cross_entropy import vocab_parallel_cross_entropy +from megatron.core.transformer.multi_token_prediction import MTPLossLoggingHelper from megatron.core.packed_seq_params import PackedSeqParams from mcore_adapter import TrainingArguments from mcore_adapter.checkpointing import get_checkpoint_dir, load_state_dict_from_checkpoint from mcore_adapter.parallel_functions import context_parallel_gather, vocab_parallel_logprobs +from mcore_adapter.patcher import patch_torch_find_nd_overlapping_shards, patch_torch_validate_global_plan from mcore_adapter.trainer.utils import get_megatron_lr_scheduler from roll.datasets.collator import collate_fn_to_dict_list from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.protocol import DataProto -from roll.distributed.scheduler.driver_utils import Barrier from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_processor_provider, default_tokenizer_provider +from roll.platforms import current_platform +from roll.third_party.megatron.model_update import MegatronWeightUpdater from roll.third_party.megatron.offload_states_patch import ( MegatronOffloadStateType, bind_megatron_offload_states_func, @@ -46,15 +55,23 @@ ) from roll.third_party.megatron.optimizer import get_megatron_optimizer from roll.third_party.megatron.tensor_parallel import vocab_parallel_entropy -from roll.utils.collective import collective -from roll.utils.constants import DIST_OPTIMIZER_DIR, IGNORE_INDEX, OPTIMIZER_NAME, RNG_STATE_DIR, SCHEDULER_NAME, RAY_NAMESPACE, BARRIER_NAME +from roll.utils.constants import ( + DIST_OPTIMIZER_DIR, + IGNORE_INDEX, + OPTIMIZER_NAME, + RNG_STATE_DIR, + SCHEDULER_NAME, +) from roll.utils.context_managers import disable_gradients -from roll.utils.functionals import append_to_dict +from roll.utils.dynamic_batching import make_micro_batch_iter_for_dynamic_batching +from roll.utils.functionals import append_to_dict, reduce_metrics, adjust_sequence_length from roll.utils.logging import get_logger from roll.utils.offload_states import OffloadStateType -from roll.utils.dynamic_batching import make_micro_batch_iter_for_dynamic_batching +from roll.utils.sequence_packing import make_micro_batch_iter_for_sequence_packing, restore_results_order -from roll.platforms import current_platform + +if TYPE_CHECKING: + from mcore_adapter.models.model_factory import VirtualModels logger = get_logger() @@ -63,6 +80,9 @@ class MegatronInferStrategy(InferenceStrategy): strategy_name = "megatron_infer" def __init__(self, worker: Worker): + #TODO remove the patches when the latest pytorch version > v2.9.1 + patch_torch_find_nd_overlapping_shards() + patch_torch_validate_global_plan() super().__init__(worker) config_dict = self.worker_config.training_args.to_dict() config_dict.update(self.worker_config.strategy_args.strategy_config) @@ -74,17 +94,13 @@ def __init__(self, worker: Worker): self.model = None self.forward_backward_func = None self.seq_length = None - self.use_remove_padding = self.worker_config.use_remove_padding self.use_sequence_packing = self.worker_config.use_sequence_packing - self.max_packed_len = None # hard to impl with offload states assert not self.megatron_train_args.overlap_param_gather, 
"overlap_param_gather is not supported" - if self.worker_config.use_remove_padding: - assert self.megatron_train_args.allow_variable_seq_lengths(), "when use_remove_padding=True, must set variable_seq_lengths=True for megatron." def initialize(self, model_provider): self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) - self.model = model_provider( + self.model: "VirtualModels" = model_provider( tokenizer=self.tokenizer, model_args=self.worker_config.model_args, training_args=self.megatron_train_args, @@ -106,6 +122,10 @@ def initialize(self, model_provider): self.worker.rank_info.cp_size = mpu.get_context_parallel_world_size() self.worker.rank_info.cp_rank = mpu.get_context_parallel_rank() + if (self.worker_config.use_dynamic_batching_in_infer or self.worker_config.use_sequence_packing) and self.worker.rank_info.pp_size > 1: + self.model.config.variable_seq_lengths = True + logger.info("Set variable_seq_lengths to True when use dynamic batching and pipeline parallel.") + logger.info(f"{self.model.get_models()}") dist.barrier() @@ -144,29 +164,48 @@ def forward_step( forward_func: Callable[[DataProto, torch.Tensor], Tuple[torch.Tensor, Dict[str, torch.Tensor]]], ) -> Dict[str, torch.Tensor]: self.model.eval() + batch.meta_info['batch_num_tokens'] = self._get_batch_num_tokens(batch, dp_group=mpu.get_data_parallel_group()) + batch.meta_info['global_valid_samples'] = self._get_global_valid_samples(batch, dp_group=mpu.get_data_parallel_group()) + output_on_all_tp_cp_ranks = batch.meta_info.get("output_on_all_tp_cp_ranks", False) if self.worker_config.use_dynamic_batching_in_infer: micro_batches_list = list(make_micro_batch_iter_for_dynamic_batching(batch)) num_microbatches = batch.meta_info["num_micro_batchs"] micro_batch_size = 1 + elif self.use_sequence_packing: + vp_size = self.worker_config.strategy_args.strategy_config['virtual_pipeline_model_parallel_size'] \ + if 'virtual_pipeline_model_parallel_size' in self.worker_config.strategy_args.strategy_config else 1 + micro_batches_list = list( + make_micro_batch_iter_for_sequence_packing(batch, tp_size=self.worker.rank_info.tp_size, + cp_size=self.worker.rank_info.cp_size, + vp_size=vp_size, is_train=False, + dp_group=mpu.get_data_parallel_group(with_context_parallel=True), + micro_batch_size=batch.meta_info["micro_batch_size"], + config=self.worker_config.sequence_packing_args)) + num_microbatches = micro_batches_list[0].meta_info["num_micro_batchs"] + micro_batch_size = 1 else: batch_size = batch.batch.batch_size[0] micro_batch_size = batch.meta_info["micro_batch_size"] num_microbatches = max(batch_size // micro_batch_size, 1) micro_batches_list = batch.chunk(chunks=num_microbatches) - if self.use_sequence_packing: - micro_batch_size = 1 - self.max_packed_len = self._get_max_packed_len(micro_batches_list) + + disable_adapter = batch.meta_info.get("disable_adapter", False) + adapter_context = self.models_unwrapped[0].disable_adapter() if disable_adapter else nullcontext() + + for micro_batch in micro_batches_list: + micro_batch.meta_info['loss_scale'] = num_microbatches * mpu.get_data_parallel_world_size() + micro_batch.meta_info['micro_batch_size'] = micro_batch.batch.batch_size[0] data_iterator = [iter(micro_batches_list) for _ in range(len(self.model))] - with disable_gradients(models=self.model.get_models()): + with disable_gradients(models=self.model.get_models()), adapter_context: # List 是每个 micro-batch 构成的 losses_reduced: List[Dict[str, torch.Tensor]] = self.forward_backward_func( 
forward_step_func=partial(self.inner_forward_step, forward_func), data_iterator=data_iterator, model=self.model.get_models(), num_microbatches=num_microbatches, - seq_length=self.seq_length if not self.use_sequence_packing else self.max_packed_len, + seq_length=self.seq_length, micro_batch_size=micro_batch_size, forward_only=True, ) @@ -176,6 +215,11 @@ def forward_step( data[k] = torch.nn.functional.pad(v, (0, self.seq_length - data[k].size(-1) - 1), "constant", 0) results = collate_fn_to_dict_list(losses_reduced) + if self.use_sequence_packing: + results = restore_results_order(results, micro_batches_list[0].meta_info['partition_indices_list'], + self.worker_config.sequence_packing_args) + + if not ( ((self.worker.rank_info.tp_rank == 0 and self.worker.rank_info.cp_rank == 0) or output_on_all_tp_cp_ranks) @@ -207,29 +251,6 @@ def _get_pad_factor(self): pad_factor = math.lcm(16, pad_factor) return pad_factor - def _get_max_packed_len(self, micro_batches_list): - max_packed_len = -1 - for micro_batch in micro_batches_list: - input_ids = micro_batch.batch["input_ids"] - attention_mask = micro_batch.batch["attention_mask"] - - batch_size = input_ids.shape[0] - seq_lens = attention_mask.sum(dim=-1) - - pad_factor = self._get_pad_factor() - - packed_len = 0 - for b in range(batch_size): - seq_len = seq_lens[b].item() if torch.is_tensor(seq_lens[b]) else seq_lens[b] - if pad_factor > 1: - padded_seq_len = ((seq_len + pad_factor - 1) // pad_factor) * pad_factor - else: - padded_seq_len = seq_len - packed_len += padded_seq_len - - max_packed_len = max(packed_len, max_packed_len) - return max_packed_len - def _pack_sequences(self, input_tensor, attention_mask, pad_packed_seq_to=None, pad_val=0): """ Pack multiple sequences into a single continuous sequence by removing padding. @@ -237,8 +258,6 @@ def _pack_sequences(self, input_tensor, attention_mask, pad_packed_seq_to=None, Implements sequence packing for efficient batch processing with variable-length sequences. Removes per-sample padding and concatenates sequences while maintaining cumulative length info. - Reference: https://github.com/NVIDIA-NeMo/RL/blob/main/nemo_rl/models/megatron/common.py - Args: input_tensor (torch.Tensor): Shape [batch_size, seq_len, ...], padded sequences. attention_mask (torch.Tensor): Shape [batch_size, seq_len], 1=valid, 0=padding. 
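For readers new to sequence packing, the sketch below shows the core transformation `_pack_sequences` performs on token ids: strip per-sample padding, round each sample up to an alignment boundary, concatenate everything into a single `(1, total_len)` row, and keep cumulative offsets (`cu_seqlens`) so per-sample boundaries can be recovered later. It is a simplified, standalone illustration (no context parallelism, no `PackedSeqParams`), not the roll implementation.

```python
import torch


def pack_sequences(input_ids: torch.Tensor, attention_mask: torch.Tensor,
                   pad_factor: int = 1, pad_val: int = 0):
    """Pack a right-padded (bsz, seqlen) batch into one (1, total_len) sequence."""
    seq_lens = attention_mask.sum(dim=-1).tolist()
    chunks, cu_seqlens = [], [0]
    for row, seq_len in zip(input_ids, seq_lens):
        tokens = row[:seq_len]                                # drop per-sample padding
        padded_len = -(-seq_len // pad_factor) * pad_factor   # round up to pad_factor
        if padded_len > seq_len:                              # re-pad to the alignment boundary
            tokens = torch.nn.functional.pad(tokens, (0, padded_len - seq_len), value=pad_val)
        chunks.append(tokens)
        cu_seqlens.append(cu_seqlens[-1] + padded_len)
    packed = torch.cat(chunks).unsqueeze(0)                   # (1, total_len)
    return packed, torch.tensor(cu_seqlens, dtype=torch.int32)


# Example: two samples of true length 3 and 2, aligned to a multiple of 4.
ids = torch.tensor([[11, 12, 13, 0, 0], [21, 22, 0, 0, 0]])
mask = torch.tensor([[1, 1, 1, 0, 0], [1, 1, 0, 0, 0]])
packed, cu = pack_sequences(ids, mask, pad_factor=4)
# packed -> tensor([[11, 12, 13, 0, 21, 22, 0, 0]]); cu -> tensor([0, 4, 8], dtype=torch.int32)
```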
@@ -300,58 +319,42 @@ def _pack_sequences(self, input_tensor, attention_mask, pad_packed_seq_to=None, # Track running sequence length for padding running_seq_len = 0 - if pad_factor > 1: - all_input_tensor_padded = [] - padded_tokens = [] - for b in range(batch_size): - seq_len = seq_lens[b].item() if torch.is_tensor(seq_lens[b]) else seq_lens[b] - if b == batch_size - 1 and pad_packed_seq_to is not None: - # Different from original implementation: calculate remaining length - padded_seq_len = pad_packed_seq_to - running_seq_len - else: - # Align to pad_factor boundary - padded_seq_len = ((seq_len + pad_factor - 1) // pad_factor) * pad_factor - - running_seq_len += padded_seq_len - - seq_tokens = input_tensor_unpadded[b] - - # Pad sequence if needed - if padded_seq_len > seq_len: - seq_tokens = torch.nn.functional.pad( - seq_tokens, (0, padded_seq_len - seq_len), value=pad_val - ) - all_input_tensor_padded.append(seq_tokens) - - if cp_size > 1: - # Handle Context Parallel distribution - # Add batch dimension for processing - seq_tokens_with_batch = seq_tokens.unsqueeze(0) # [1, seq_len] - seq_tokens_with_batch = self._get_feature_on_this_cp_rank( - seq_tokens_with_batch, "seq_tokens" - ) - seq_tokens = seq_tokens_with_batch.squeeze(0) # Remove batch dimension - - padded_tokens.append(seq_tokens) - - # Concatenate all sequences - packed_input_tensor = torch.cat(padded_tokens, dim=0).unsqueeze(0) - all_input_tensor_padded = torch.cat(all_input_tensor_padded, dim=0).unsqueeze(0) + all_input_tensor_padded = [] + padded_tokens = [] + for b in range(batch_size): + seq_len = seq_lens[b].item() if torch.is_tensor(seq_lens[b]) else seq_lens[b] + if b == batch_size - 1 and pad_packed_seq_to is not None: + # Different from original implementation: calculate remaining length + padded_seq_len = pad_packed_seq_to - running_seq_len + else: + # Align to pad_factor boundary + padded_seq_len = ((seq_len + pad_factor - 1) // pad_factor) * pad_factor - else: - # No padding factor: simply concatenate unpadded sequences - packed_input_tensor = torch.cat(input_tensor_unpadded, dim=0).unsqueeze(0) - all_input_tensor_padded = packed_input_tensor - if pad_packed_seq_to is not None: - # Pad to target length if specified - pad_len = pad_packed_seq_to - packed_input_tensor.shape[1] - if pad_len > 0: - packed_input_tensor = torch.nn.functional.pad( - packed_input_tensor, (0, pad_len), value=pad_val - ) - all_input_tensor_padded = torch.nn.functional.pad( - all_input_tensor_padded, (0, pad_len), value=pad_val - ) + running_seq_len += padded_seq_len + + seq_tokens = input_tensor_unpadded[b] + + # Pad sequence if needed + if padded_seq_len > seq_len: + seq_tokens = torch.nn.functional.pad( + seq_tokens, (0, padded_seq_len - seq_len), value=pad_val + ) + all_input_tensor_padded.append(seq_tokens) + + if cp_size > 1: + # Handle Context Parallel distribution + # Add batch dimension for processing + seq_tokens_with_batch = seq_tokens.unsqueeze(0) # [1, seq_len] + seq_tokens_with_batch = self._get_feature_on_this_cp_rank( + seq_tokens_with_batch, "seq_tokens" + ) + seq_tokens = seq_tokens_with_batch.squeeze(0) # Remove batch dimension + + padded_tokens.append(seq_tokens) + + # Concatenate all sequences + packed_input_tensor = torch.cat(padded_tokens, dim=0).unsqueeze(0) + all_input_tensor_padded = torch.cat(all_input_tensor_padded, dim=0).unsqueeze(0) if cu_seqlens_padded is None: cu_seqlens_padded = cu_seqlens.clone() @@ -384,42 +387,17 @@ def _pack_sequences(self, input_tensor, attention_mask, pad_packed_seq_to=None, 
cu_seqlens_padded, ) - def _get_tokens_on_this_cp_rank( - self, - input_ids: torch.Tensor, - cp_rank: int, - cp_size: int, - seq_dim: int = 1, - ) -> torch.Tensor: - """Get tokens on this context parallelism rank. - - Assumes that input_ids are already padded to a multiple of cp_size * 2 or cp_size == 1. - - Args: - input_ids: Input token IDs [seq_length, ] - cp_rank: Context parallelism rank - cp_size: Context parallelism size - - Returns: - Tokens on this context parallelism rank [1, seq_length // cp_size] + def _unpack_sequences(self, output_tensor, cu_seqlens_padded): """ - if cp_size == 1: - return input_ids - - # load balance for causal attention - shard_size = input_ids.shape[seq_dim] // (cp_size * 2) - shard_inds = (cp_rank, (cp_size * 2) - cp_rank - 1) - - # Create slices for each dimension - slices = [slice(None)] * input_ids.dim() - ids_chunks = [] - - for ind in shard_inds: - slices[seq_dim] = slice(ind * shard_size, (ind + 1) * shard_size) - ids_chunks.append(input_ids[slices]) + Unpack concatenated sequences into individual padded sequences. + """ + cp_size = mpu.get_context_parallel_world_size() + seq_starts = cu_seqlens_padded[:-1] // cp_size + seq_ends = cu_seqlens_padded[1:] // cp_size - ids = torch.cat(ids_chunks, dim=seq_dim) - return ids + for seq_idx, (seq_start, seq_end) in enumerate(zip(seq_starts, seq_ends)): + local_chunk = output_tensor[:, seq_start:seq_end] + yield local_chunk def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], model): data = next(data_iterator) @@ -428,18 +406,12 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode labels = data.batch["labels"] if "labels" in data.batch else None # labels is only used for sft packed_seq_params = None - if self.use_remove_padding: - unpad_seq_len = self._get_unpad_seqlen(attention_mask=attention_mask) - input_ids = input_ids[:, :unpad_seq_len].contiguous() - attention_mask = attention_mask[:, :unpad_seq_len].contiguous() if self.use_sequence_packing: input_ids, packed_seq_params, cu_seqlens, cu_seqlens_padded = self._pack_sequences( - input_ids, attention_mask, pad_packed_seq_to=self.max_packed_len + input_ids, attention_mask, ) if labels is not None: - labels, _, _, _ = self._pack_sequences(labels, attention_mask, pad_packed_seq_to=self.max_packed_len, - pad_val=IGNORE_INDEX) - data.meta_info['labels_packed'] = labels + labels, _, _, _ = self._pack_sequences(labels, attention_mask, pad_val=IGNORE_INDEX) attention_mask = None else: input_ids = self._get_feature_on_this_cp_rank(input_ids, "input_ids") @@ -451,7 +423,7 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode # AttnMaskType.causal in which attention_mask would not be used, pass # it mainly for moe aux loss without pad token and it is 2D # position_ids: not used in LLM - # While TransformerTurbo Qwen2VlModel requires 4D attention_mask, and + # While MCA Qwen2VlModel requires 4D attention_mask, and # attention_mask and position_ids would be chunked for cp with dim 2 as # seq dim in it if they are provided forward_args = data.meta_info.get("forward_args", {}) @@ -459,9 +431,9 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode # not support MoE VLM, not used temperarily attention_mask = None position_ids = data.batch["position_ids"] - position_ids = position_ids.transpose(0, 1) # (bsz, 3, seqlen) -> (3, bsz, seqlen) - if self.use_remove_padding: - position_ids = position_ids[:, :, :unpad_seq_len].contiguous() + if position_ids.size(1) == 
4: + position_ids = position_ids[:, 1:, :].contiguous() # (bsz, 4, seqlen) -> (bsz, 3, seqlen) + position_ids = position_ids.transpose(0, 1) # (bsz, C, seqlen) -> (C, bsz, seqlen) if "multi_modal_inputs" in data.non_tensor_batch: multi_modal_inputs = data.non_tensor_batch["multi_modal_inputs"] multi_modal_data = defaultdict(list) @@ -476,22 +448,67 @@ def inner_forward_step(self, loss_func, data_iterator: Iterator[DataProto], mode forward_args[key] = torch.concat(multi_modal_data[key], dim=0).to(input_ids.device) forward_args.update({"force_vit_image": True}) + # megatron_llama_core need loss_mask to compute aux loss + if "loss_mask" not in forward_args: + if labels is not None: + forward_args["loss_mask"] = (labels != IGNORE_INDEX).float() + else: + forward_args["loss_mask"] = torch.ones_like(input_ids) + output_tensor = model( input_ids=input_ids, attention_mask=attention_mask, position_ids=position_ids, labels=labels, packed_seq_params=packed_seq_params, **forward_args ) if self.use_sequence_packing: - loss_func.set_packing_params(cu_seqlens=cu_seqlens, cu_seqlens_padded=cu_seqlens_padded, logger=logger) - - return output_tensor, partial(loss_func, data) - - def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, parameter_name): + cp_size = mpu.get_context_parallel_world_size() + def loss_wrapper(output_tensor): + unpacked_output_iter = self._unpack_sequences( + output_tensor, + cu_seqlens_padded, + ) + loss_result = torch.tensor(0.0, device=output_tensor.device) + metrics_result_list = [] + num_samples = len(data) + for i in range(num_samples): + single_output_tensor = next(unpacked_output_iter) + full_seq_len = single_output_tensor.size(1) * cp_size + if full_seq_len == 0: + # Create a mock output tensor when the sample is empty to ensure the subsequent pipeline works correctly. 
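+                        # (A packed length of zero means this sample contributed no valid
+                        #  tokens to the packed micro batch; substituting a zero tensor of
+                        #  pad-factor length keeps the shapes that loss_func expects.)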
+ full_seq_len = self._get_pad_factor() + local_seq_len = max(1, full_seq_len // cp_size) + new_shape = list(single_output_tensor.shape) + new_shape[1] = local_seq_len + single_output_tensor = torch.zeros(new_shape, dtype=single_output_tensor.dtype, + device=single_output_tensor.device) + single_data = data[i:i+1] + for key, val in single_data.batch.items(): + single_data.batch[key] = adjust_sequence_length(val, full_seq_len, self.seq_length, pad_value=IGNORE_INDEX + if key in {'labels', 'labels_for_loss'} else 0) + loss, metrics = loss_func(single_data, single_output_tensor) + loss_result += loss + for key, val in metrics.items(): + if isinstance(val, torch.Tensor): + metrics[key] = adjust_sequence_length(val, self.seq_length, full_seq_len, pad_value=0) + metrics_result_list.append(metrics) + del single_output_tensor + metrics_result_dict = collate_fn_to_dict_list(metrics_result_list) + if self.worker_config.apply_loss_scale: + loss_result *= data.meta_info['loss_scale'] + return loss_result, reduce_metrics(metrics_result_dict) + + return output_tensor, loss_wrapper + else: + def loss_wrapper(output_tensor): + loss, metrics = loss_func(data, output_tensor) + if self.worker_config.apply_loss_scale: + loss *= data.meta_info['loss_scale'] + return loss, metrics + return output_tensor, loss_wrapper + + def broadcast_parameter(self, *args, **kwargs): pass - def broadcast_bucket(self, model_update_name, src_pp_rank, meta_infos, bucket_size): - raise NotImplementedError - def load_states(self, include=None, non_blocking=False): reload_megatron_no_grad_module(model_chunks=self.model.get_models()) @@ -508,10 +525,7 @@ def op_compute_log_probs(self, logits: torch.Tensor, input_ids: torch.Tensor, at """ ori_seq_length = attention_mask.size(1) cp_size = mpu.get_context_parallel_world_size() - seq_len = logits.size(1) * cp_size if self.use_remove_padding else ori_seq_length - # remove padding token - if self.use_remove_padding: - input_ids = input_ids[:, :seq_len] + seq_len = ori_seq_length labels: torch.Tensor = input_ids[:, 1:].clone() labels[attention_mask[:, 1:seq_len] == 0] = 0 # avoid invalid token id @@ -522,21 +536,15 @@ def op_compute_log_probs(self, logits: torch.Tensor, input_ids: torch.Tensor, at log_probs = vocab_parallel_logprobs(logits, labels) if mpu.get_context_parallel_world_size() > 1: log_probs = context_parallel_gather(log_probs, parallel_dim=1) - # add pad to recover tensor shape - if self.use_remove_padding: - pad_token_num = ori_seq_length - seq_len - log_probs = torch.nn.functional.pad(log_probs, pad=(0, pad_token_num), value=0) log_probs = log_probs[:, :-1] * attention_mask[:, 1:] return log_probs def op_compute_entropy(self, logits: torch.Tensor, attention_mask: torch.Tensor): + if self.worker_config.logits_in_fp32: + logits = logits.float() entropy = vocab_parallel_entropy(logits) if mpu.get_context_parallel_world_size() > 1: entropy = context_parallel_gather(entropy, parallel_dim=1) - # add pad to recover shape - if self.use_remove_padding: - pad_token_num = attention_mask.size(1) - entropy.size(1) - entropy = torch.nn.functional.pad(entropy, pad=(0, pad_token_num), value=0) entropy = entropy[:, :-1] * attention_mask[:, 1:] return entropy @@ -927,31 +935,25 @@ def op_compute_various_divergence( else: raise ValueError(f"Unsupported reduction: {reduction}. 
Use 'mean', 'sum', or 'none'.") - def op_compute_language_loss(self, losses: torch.Tensor, labels: torch.Tensor): - if not self.use_sequence_packing: - labels = self._get_feature_on_this_cp_rank(labels, "labels") + def op_compute_language_loss(self, losses: torch.Tensor, labels: torch.Tensor, batch_num_tokens: int): + labels = self._get_feature_on_this_cp_rank(labels, "labels") loss_mask = (labels != IGNORE_INDEX).float() loss_mask = loss_mask.view(-1).float() losses = torch.sum(losses.view(-1) * loss_mask) - loss_mask = loss_mask.sum() if mpu.get_context_parallel_world_size() > 1: - loss_info = torch.cat([losses.view(1), loss_mask.view(1)]) + loss_info = torch.cat([losses.view(1)]) torch.distributed.all_reduce( loss_info, op=torch.distributed.ReduceOp.SUM, group=mpu.get_context_parallel_group() ) - losses, loss_mask = loss_info[0], loss_info[1] + losses = loss_info[0] - loss = losses.clone() # clone to make sure loss is not a view + loss = losses.clone() / batch_num_tokens# clone to make sure loss is not a view - local_num_tokens = loss_mask.clone().detach() - if local_num_tokens == 0: - local_num_tokens += 1 # avoid divide by zero + metrics = {f"{self.worker_config.name}/loss@sum": loss.clone().detach().item()} - metrics = {f"{self.worker_config.name}/loss": (loss / local_num_tokens).clone().detach().unsqueeze(0)} - - return loss, local_num_tokens.int(), metrics + return loss, metrics class MegatronTrainStrategy(MegatronInferStrategy, TrainStrategy): strategy_name = "megatron_train" @@ -965,11 +967,12 @@ def __init__(self, worker: Worker): def initialize(self, model_provider): self.seq_length = self.worker.pipeline_config.sequence_length + self.weight_updaters: dict[str, MegatronWeightUpdater] = {} self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) self.processor = default_processor_provider(model_args=self.worker_config.model_args) # model provider will initialize megatron distributed groups - self.model = model_provider( + self.model: "VirtualModels" = model_provider( tokenizer=self.tokenizer, model_args=self.worker_config.model_args, training_args=self.megatron_train_args, @@ -1032,10 +1035,6 @@ def initialize(self, model_provider): self.worker.rank_info.cp_size = mpu.get_context_parallel_world_size() self.worker.rank_info.cp_rank = mpu.get_context_parallel_rank() - self.barrier = Barrier.options( - name=BARRIER_NAME, get_if_exists=True, namespace=RAY_NAMESPACE - ).remote(self.worker.world_size / self.worker.rank_info.pp_size) - logger.info(f"max steps pipeline {self.worker_config.training_args.max_steps}") self.worker_config.training_args.max_steps = ( self.worker_config.training_args.max_steps // self.worker.rank_info.dp_size @@ -1068,18 +1067,37 @@ def initialize(self, model_provider): if len(self.models_wrapped) == 1: model_config.grad_sync_func = model_config.grad_sync_func[0] + if (self.worker_config.use_dynamic_batching_in_train or self.worker_config.use_sequence_packing or + self.worker_config.use_sequence_packing) and self.worker.rank_info.pp_size > 1: + self.model.config.variable_seq_lengths = True + logger.info("Set variable_seq_lengths to True when use dynamic batching and pipeline parallel.") + logger.info(f"{self.model.get_models()}") dist.barrier() def train_step(self, batch: DataProto, loss_func: Callable): self.model.train() + global_step = batch.meta_info.get("global_step", 0) is_offload_optimizer_states_in_train_step = batch.meta_info.get("is_offload_optimizer_states_in_train_step", True) + batch.meta_info['batch_num_tokens'] = 
self._get_batch_num_tokens(batch, dp_group=mpu.get_data_parallel_group()) + batch.meta_info['global_valid_samples'] = self._get_global_valid_samples(batch, dp_group=mpu.get_data_parallel_group()) if self.worker_config.use_dynamic_batching_in_train: micro_batches_list = list(make_micro_batch_iter_for_dynamic_batching(batch)) num_microbatches = batch.meta_info["num_micro_batchs"] mini_batch_size = 1 + elif self.use_sequence_packing: + vp_size = self.worker_config.strategy_args.strategy_config['virtual_pipeline_model_parallel_size']\ + if 'virtual_pipeline_model_parallel_size' in self.worker_config.strategy_args.strategy_config else 1 + micro_batches_list = list(make_micro_batch_iter_for_sequence_packing(batch, tp_size=self.worker.rank_info.tp_size, + cp_size=self.worker.rank_info.cp_size, + vp_size=vp_size, is_train=True, + dp_group=mpu.get_data_parallel_group(with_context_parallel=True), + micro_batch_size=self.worker_config.training_args.per_device_train_batch_size, + config=self.worker_config.sequence_packing_args)) + num_microbatches = micro_batches_list[0].meta_info["num_micro_batchs"] + mini_batch_size = 1 else: mini_batch_size = self.worker_config.training_args.per_device_train_batch_size num_microbatches = batch.batch.batch_size[0] // self.worker_config.training_args.per_device_train_batch_size @@ -1087,10 +1105,10 @@ def train_step(self, batch: DataProto, loss_func: Callable): num_microbatches == self.megatron_train_args.gradient_accumulation_steps ), f"num_microbatches={num_microbatches} gradient_accumulation_steps={self.megatron_train_args.gradient_accumulation_steps}" micro_batches_list = batch.chunk(chunks=num_microbatches) - if self.use_sequence_packing: - mini_batch_size = 1 - self.max_packed_len = self._get_max_packed_len(micro_batches_list) - logger.info(f"max_packed_len: {self.max_packed_len}") + + for micro_batch in micro_batches_list: + micro_batch.meta_info['loss_scale'] = num_microbatches * mpu.get_data_parallel_world_size() + micro_batch.meta_info['micro_batch_size'] = micro_batch.batch.batch_size[0] data_iterator = [iter(micro_batches_list) for _ in range(len(self.model))] @@ -1099,13 +1117,14 @@ def train_step(self, batch: DataProto, loss_func: Callable): data_iterator=data_iterator, model=self.model.get_models(), num_microbatches=num_microbatches, - seq_length=self.seq_length if not self.use_sequence_packing else self.max_packed_len, + seq_length=self.seq_length, micro_batch_size=mini_batch_size, forward_only=False, ) # 只有step的时候需要load optimizer states self.load_states(include=[OffloadStateType.optimizer_states]) + update_successful, grad_norm, num_zeros_in_grad = self.optimizer.step() if is_offload_optimizer_states_in_train_step: self.offload_states(include=[OffloadStateType.optimizer_states], non_blocking=True) @@ -1117,6 +1136,13 @@ def train_step(self, batch: DataProto, loss_func: Callable): for model in self.model: model.zero_grad_buffer() + # Offload/reload does not update cached_param_buffer_shard_list/cached_grad_buffer_shard_list, + # resulting using old params in `start_param_sync`, which leads to wrong results. So we clear the cache. 
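+            # (Resetting the entries to None keeps one slot per bucket while dropping the
+            #  shard views captured before offload; see the loop below.)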
+ for bucket_group in model.bucket_groups + model.expert_parallel_bucket_groups: + if hasattr(bucket_group, "cached_param_buffer_shard_list"): + bucket_group.cached_param_buffer_shard_list = [None] * len(bucket_group.buckets) + if hasattr(bucket_group, "cached_grad_buffer_shard_list"): + bucket_group.cached_grad_buffer_shard_list = [None] * len(bucket_group.buckets) self.optimizer.zero_grad() metrics = {} @@ -1136,49 +1162,23 @@ def train_step(self, batch: DataProto, loss_func: Callable): clear_aux_losses_tracker() metrics.update(moe_losses) + if self.model.config.mtp_num_layers is not None and self.model.config.mtp_num_layers > 0: + mtp_total_loss_dict = {} + MTPLossLoggingHelper.reduce_loss_in_tracker() + tracker = MTPLossLoggingHelper.tracker + if "values" in tracker: + loss_scale = 1 / self.megatron_train_args.gradient_accumulation_steps + mtp_losses = tracker["values"] * loss_scale + mtp_num_layers = mtp_losses.shape[0] + for i in range(mtp_num_layers): + name = self.worker_config.name + "/" + f"mtp_{i+1} loss" + mtp_total_loss_dict[name] = mtp_losses[i].item() + MTPLossLoggingHelper.clean_loss_in_tracker() + metrics.update(mtp_total_loss_dict) return metrics - def model_update(self, model_update_name, tgt_workers, broadcast_tgt_devices, p2p_tgt_devices): - comm_plan = self.model_update_comm_plan[model_update_name][self.worker.rank_info.pp_rank] - broadcast_time_cost = 0 - with Timer("model_update_total") as timer_total: - for meta_infos, buffer in self.model.all_gather_weights_as_hf_bucket( - models=self.models_unwrapped, bucket_size=256 * 1024 * 1024 - ): - ray.get(self.barrier.wait.remote()) - refs = [] - with Timer("broadcast") as timer_broadcast: - for p2p_tgt_device in p2p_tgt_devices: - p2p_tgt_worker = tgt_workers[p2p_tgt_device["rank"]] - ref = p2p_tgt_worker.update_parameter_in_bucket.remote(model_update_name=model_update_name, - meta_infos=meta_infos, buffer=buffer, ranks_in_worker=[p2p_tgt_device["device"]["rank"]] - ) - refs.append(ref) - - if ( - self.worker.rank_info.tp_rank == 0 - and self.worker.rank_info.cp_rank == 0 - and self.worker.rank_info.dp_rank == 0 - ): - for worker in tgt_workers: - ref = worker.broadcast_bucket.remote( - model_update_name=model_update_name, - src_pp_rank=self.worker.rank_info.pp_rank, - meta_infos=meta_infos, - bucket_size=buffer.numel() * buffer.element_size(), - ) - refs.append(ref) - if len(broadcast_tgt_devices) > 0: - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - ray.get(refs) - ray.get(self.barrier.wait.remote()) - broadcast_time_cost += timer_broadcast.last - - metrics = { - "all_gather": timer_total.last - broadcast_time_cost, - "broadcast": broadcast_time_cost, - } - return metrics + def model_update(self, model_update_name: str): + return self.weight_updaters[model_update_name].model_update() def load_states(self, include=None, non_blocking=False): if include is not None: @@ -1208,6 +1208,16 @@ def offload_states(self, include=None, non_blocking=False, pin_memory=True): RotaryEmbedding.forward.cache_clear() current_platform.empty_cache() + def setup_model_update(self, infer_cluster, model_update_name: str): + assert model_update_name not in self.weight_updaters + self.weight_updaters[model_update_name] = MegatronWeightUpdater( + pipeline_config=self.worker.pipeline_config, + worker_config=self.worker_config, + model_update_name=model_update_name, + models_unwrapped=self.models_unwrapped, + infer_cluster=infer_cluster, + ) + def save_checkpoint(self, save_dir, global_step, ckpt_id, 
tag="checkpoint", local_state_path=None, **kwargs): logger.info(f"save_dir: {save_dir}") if local_state_path is None: @@ -1215,6 +1225,11 @@ def save_checkpoint(self, save_dir, global_step, ckpt_id, tag="checkpoint", loca with Timer("load") as load_timer: self.load_states() + is_last_step = kwargs.get("is_last_step", False) + + if self.megatron_train_args.save_hf_model: + self.model.save_pretrained_as_hf(save_dir) + # save model and tokenizer if len(self.models_unwrapped) == 1: self.models_unwrapped[0].save_pretrained(save_dir) @@ -1269,7 +1284,7 @@ def save_checkpoint(self, save_dir, global_step, ckpt_id, tag="checkpoint", loca os.makedirs(os.path.dirname(rgn_path), exist_ok=True) torch.save(rng_states, rgn_path) - if self.worker_config.checkpoint_config.get("async_upload", True): + if self.worker_config.checkpoint_config.get("async_upload", True) and not is_last_step: self.thread_executor.submit(self.checkpoint_manager.upload, ckpt_id=ckpt_id, local_state_path=local_state_path) else: self.checkpoint_manager.upload(ckpt_id=ckpt_id, local_state_path=local_state_path) @@ -1292,6 +1307,8 @@ def load_checkpoint(self, load_dir, tag="checkpoint", **kwargs): f"Loading optimizer from {optimizer_checkpoint}, process_index: {self.megatron_train_args.process_index}" ) + self.offload_states() + if self.megatron_train_args.use_distributed_optimizer: model_shared_state_dict = self.model.sharded_state_dict() sharded_state_dict = self.optimizer.sharded_state_dict( @@ -1334,3 +1351,5 @@ def load_checkpoint(self, load_dir, tag="checkpoint", **kwargs): tensor_parallel.get_cuda_rng_tracker().set_states(checkpoint_rng_state["rng_tracker_states"]) else: logger.info(f"not load rng state, not found file: {rng_file}") + + self.load_states() diff --git a/roll/distributed/strategy/mock_strategy.py b/roll/distributed/strategy/mock_strategy.py index fcf626732..81da4179b 100644 --- a/roll/distributed/strategy/mock_strategy.py +++ b/roll/distributed/strategy/mock_strategy.py @@ -81,9 +81,6 @@ def unwrap_model(self): # return self.model raise NotImplementedError - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker): - logger.warning(f"update_parameter method is not implemented in {self.strategy_name} strategy") - def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, ranks_in_worker): logger.warning(f"update_parameter_in_bucket method is not implemented in {self.strategy_name} strategy") diff --git a/roll/distributed/strategy/sglang_strategy.py b/roll/distributed/strategy/sglang_strategy.py index cdd46ce4d..475e0eb70 100644 --- a/roll/distributed/strategy/sglang_strategy.py +++ b/roll/distributed/strategy/sglang_strategy.py @@ -1,12 +1,10 @@ -import asyncio import copy import gc import io import os -import queue -from concurrent import futures +import dataclasses +import pathlib from datetime import timedelta -from typing import List, Optional import torch import torch.distributed as dist @@ -16,10 +14,18 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.protocol import DataProto from roll.distributed.strategy.strategy import InferenceStrategy -from roll.third_party.sglang import async_engine from roll.third_party.sglang import patch as sglang_patch -from roll.utils.functionals import GenerateRequestType, concatenate_input_and_output +from sglang.srt.managers.io_struct import ( + GenerateReqInput, + ReleaseMemoryOccupationReqInput, + ResumeMemoryOccupationReqInput, + UpdateWeightsFromDistributedReqInput, + 
InitWeightsUpdateGroupReqInput, + UpdateWeightsFromTensorReqInput, +) +from roll.utils.functionals import concatenate_input_and_output from roll.utils.logging import get_logger +from roll.utils.network_utils import collect_free_port from roll.utils.offload_states import OffloadStateType from roll.platforms import current_platform @@ -32,33 +38,50 @@ logger = get_logger() +def launch_server_process(server_args): + import multiprocessing + from sglang.srt.entrypoints.http_server import launch_server + p = multiprocessing.Process(target=launch_server, args=(server_args,)) + p.start() + return p + + +class SglangSlaveActor: + def __init__(self, sglang_config): + self.sglang_config = sglang_config + self.running_process = None + + def initialize(self): + os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None) + os.environ["FLASHINFER_WORKSPACE_BASE"] = os.path.join( + pathlib.Path.home().as_posix(), ".cache", os.environ.get("WORKER_NAME", "")) + from sglang.srt.server_args import ServerArgs + sargs = ServerArgs(**self.sglang_config) + self.running_process = launch_server_process(sargs) + + class SgLangStrategy(InferenceStrategy): strategy_name = "sglang" def __init__(self, worker: Worker): super().__init__(worker) self.model - self.async_model = None - self.sampling_params = None - self.use_gpu_executor = True - self.executor: futures.ThreadPoolExecutor = futures.ThreadPoolExecutor(max_workers=1) - self.sglang_outputs_list: List = [] - self.input_ids_list: List = [] - self.command_queue: Optional[queue.Queue] = None - - self.request_ids = set() - self.generation_config = None - self.running = None - - def initialize(self, model_provider): + self.slave_list = [] + + async def initialize(self, model_provider): + import ray set_seed(seed=self.worker.pipeline_config.seed) - self.command_queue = queue.Queue() dist.init_process_group(backend=current_platform.communication_backend, timeout=timedelta(minutes=self.worker_config.backend_timeout)) dist.all_reduce(torch.zeros(1).to(current_platform.device_type)) sglang_config = copy.deepcopy(self.worker_config.strategy_args.strategy_config) - tp_size = sglang_config.pop("tensor_parallel_size", current_platform.device_count()) + tp_size = sglang_config.pop("tensor_parallel_size", len(self.worker_config.resource_placement_groups)) + pp_size = sglang_config.pop("pipeline_parallel_size", 1) + gpu_per_worker = current_platform.device_count() + + assert (tp_size * pp_size) % gpu_per_worker == 0 + nnodes = (tp_size * pp_size) // gpu_per_worker dp_rank = dist.get_rank() dp_size = dist.get_world_size() @@ -89,13 +112,67 @@ def initialize(self, model_provider): "port": 30000 + dp_rank * 500, # 'disable_cuda_graph': True, "disable_custom_all_reduce": sglang_config.get("disable_custom_all_reduce", True), + 'nnodes': nnodes, + 'node_rank': 0, } ) + + if nnodes > 1: + sglang_config['dist_init_addr'] = f'{ray.util.get_node_ip_address()}:{collect_free_port()}' + logger.info(f"[sglang][sglang_config]: {sglang_config}") + + sglang_args_list = [] + for i in range(nnodes): + sglang_config_tmp = copy.deepcopy(sglang_config) + sglang_config_tmp['node_rank'] = i + sglang_args_list.append(sglang_config_tmp) + + if nnodes > 1: + node_index = 0 + sglang_pg_list = [] + for item in self.worker_config.resource_placement_groups: + if item['node_rank'] == node_index and item['gpu_rank'] == 0: + sglang_pg_list.append(item['placement_group']) + node_index += 1 + + sglang_ray_option_list = [] + from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy + from roll.utils.constants 
import RAY_NAMESPACE + for i in range(nnodes): + sglang_ray_option = { + 'scheduling_strategy': PlacementGroupSchedulingStrategy(sglang_pg_list[i]), + 'name': f'sglang-slave-{i}', + 'namespace': RAY_NAMESPACE, + 'runtime_env': + {'env_vars': + {'WORLD_SIZE': str(nnodes), + 'RANK': str(i), + 'WORKER_NAME': f'sglang-slave-{i}', + 'CUDA_VISIBLE_DEVICES': ','.join(map(str, list(range(gpu_per_worker)))), 'RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES': '1', + 'ROLL_LOG_DIR': os.getenv("ROLL_LOG_DIR", "./output/logs/") + } + }, + 'num_cpus': 0.01, + 'max_concurrency': 1000, + 'num_gpus': 0.01 + } + sglang_ray_option_list.append(sglang_ray_option) + + SglangSlaveActor_ray = ray.remote(SglangSlaveActor) + for i in range(1, nnodes): + _sglang_worker = SglangSlaveActor_ray.options(**sglang_ray_option_list[i]).remote(sglang_args_list[i]) + ray.get([_sglang_worker.initialize.remote()]) + self.slave_list.append(_sglang_worker) + os.environ.pop("PYTORCH_CUDA_ALLOC_CONF", None) - self.model = sglang_patch.engine.EngineSA(**sglang_config) - self.model.is_model_in_gpu = True + os.environ["FLASHINFER_WORKSPACE_BASE"] = os.path.join( + pathlib.Path.home().as_posix(), ".cache", os.environ.get("WORKER_NAME", "")) + self.model = sglang_patch.engine.engine_module.Engine(**sglang_args_list[0]) + self.is_model_in_gpu = True + self.is_kv_cache_in_gpu = True + self.has_run = False self.tokenizer = get_tokenizer(self.worker_config.model_args.model_name_or_path, trust_remote_code=True) @@ -109,59 +186,92 @@ def initialize(self, model_provider): {"additional_special_tokens": special_tokens}, replace_additional_special_tokens=False ) logger.info(f"add {special_tokens} to additional_special_tokens: {self.tokenizer.additional_special_tokens}") - self.event_loop = asyncio.get_event_loop() def op_compute_log_probs(self, logits: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor): pass - def start_server(self, data: DataProto, request_complete_callback): - self.running = True - self.command_queue = queue.Queue() - async_engine.start_async_sglang( - self.event_loop, - self.model, - request_complete_callback, - self.command_queue, - max_running_requests=self.worker.pipeline_config.max_running_requests, + async def abort_requests(self, request_ids=None): + if request_ids is None: # temporary solution to abort rquest with parallel sampling + request_ids = self.model.tokenizer_manager.rid_to_state + for rid in request_ids: + self.model.tokenizer_manager.abort_request(rid) + + async def generate_request(self, data: DataProto): + self.has_run = True + + input_ids = data.batch["input_ids"] + assert input_ids.size(0) == 1, f"data['input_ids'] must have exactly one batch dimension" + attention_mask = data.batch["attention_mask"] + rid = data.meta_info["request_id"] + assert isinstance(rid, str) + generation_config = data.meta_info.get("generation_config") + collect_unfinished = data.meta_info.get("collect_unfinished", False) + max_new_tokens = data.meta_info.get("max_new_tokens", generation_config["max_new_tokens"]) + max_new_tokens = min(max_new_tokens, generation_config["max_new_tokens"]) + sampling_params = create_sampling_params_for_sglang( + gen_kwargs={**generation_config, "max_new_tokens": max_new_tokens} ) - - def add_request(self, command, data: DataProto): - if command == GenerateRequestType.ADD: - input_ids = data.batch["input_ids"] - attention_mask = data.batch["attention_mask"] - request_id = data.meta_info["request_id"] - self.request_ids.add(request_id) - generation_config = 
data.meta_info.get("generation_config") - max_new_tokens = data.meta_info.get("max_new_tokens", generation_config["max_new_tokens"]) - max_new_tokens = min(max_new_tokens, generation_config["max_new_tokens"]) - sampling_params = create_sampling_params_for_sglang( - gen_kwargs={**generation_config, "max_new_tokens": max_new_tokens} - ) - prompt_token_ids = gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask) - async_engine.add_request( - self.command_queue, ([request_id], prompt_token_ids, sampling_params, data.meta_info) + input_ids = gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask) + assert isinstance(input_ids, list) and isinstance(input_ids[0], list) + if sampling_params['n'] > 1: + assert not collect_unfinished, "collect_unfinished is not supported in parallel sampling" + rid = None # sglang does not support using rid with parallel sampling + + obj_init_kw = {} # return logprobs may be in GenerateReqInput not SamplingParams + for field in dataclasses.fields(GenerateReqInput): + if field.name in sampling_params: + obj_init_kw[field.name] = sampling_params.pop(field.name) + from sglang import __version__ as version + if version >= '0.4.6.post4': + sampling_params['stream_interval'] = 50 + obj = GenerateReqInput( + input_ids=input_ids[0], + sampling_params=sampling_params, + rid=rid, + stream=True, + **obj_init_kw, + ) + chunks: list[dict] = [None for _ in range(sampling_params['n'])] + generator = self.model.tokenizer_manager.generate_request(obj, None) + async for chunk in generator: + index = chunk.get("index", 0) + chunks[index] = chunk + + output_data = DataProto(meta_info=data.meta_info) + + if not all(chunk is not None for chunk in chunks): + output_data.meta_info["finish_reasons"] = ["abort"] + return output_data + + output_token_ids = [chunk.get("output_ids", []) for chunk in chunks] + output_logprobs = [chunk["meta_info"].get("output_token_logprobs", None) for chunk in chunks] + has_logprobs = any(logprobs is not None for logprobs in output_logprobs) + if has_logprobs: + lens = [min(len(ids), len(logprobs)) for ids, logprobs in zip(output_token_ids, output_logprobs)] + output_token_ids = [ids[:l] for ids, l in zip(output_token_ids, lens)] + output_logprobs = [logprobs[:l] for logprobs, l in zip(output_logprobs, lens)] + output_logprobs = [[prob_info[0] for prob_info in logprobs] for logprobs in output_logprobs] + output_data.meta_info["output_logprobs"] = output_logprobs + assert all([len(ids) == len(logprobs) for ids, logprobs in zip(output_token_ids, output_logprobs)]), ( + "output_token_ids and output_logprobs length not match" ) - - elif command == GenerateRequestType.ABORT: - request_id = data.meta_info["request_id"] - async_engine.abort_request(self.command_queue, rid=request_id) - - elif command == GenerateRequestType.STOP: - self.command_queue.put(None) - self.request_ids.clear() - self.running = False - - def generate(self, batch: DataProto, generation_config): - if self.sampling_params is None: - self.sampling_params = create_sampling_params_for_sglang(gen_kwargs=generation_config) - old_sampling_params = self.sampling_params - logger.info(f"sampling_params: {self.sampling_params}") - else: - new_sampling_params = create_sampling_params_for_sglang(gen_kwargs=generation_config) - old_sampling_params = self.sampling_params - if not compare_sampling_params(new_sampling_params, self.sampling_params): - self.sampling_params = new_sampling_params - logger.info(f"switch sampling_params: {self.sampling_params}") + 
output_data.meta_info["output_token_ids"] = output_token_ids + output_data.meta_info["finish_reasons"] = [] + for chunk in chunks: + if isinstance(chunk["meta_info"].get("finish_reason"), dict): + finish_reason = chunk["meta_info"]["finish_reason"]["type"] + output_data.meta_info["finish_reasons"].append(finish_reason) + else: + # convert finish_reason None to 'abort' + output_data.meta_info["finish_reasons"].append("abort") + assert len(output_data.meta_info["finish_reasons"]) == len(output_data.meta_info["output_token_ids"]) + return output_data + + async def generate(self, batch: DataProto, generation_config): + self.has_run = True + assert self.is_model_in_gpu + sampling_params = create_sampling_params_for_sglang(gen_kwargs=generation_config) + logger.info(f"sampling_params: {sampling_params}") input_ids = batch.batch["input_ids"] # (bs, prompt_length) attention_mask = batch.batch["attention_mask"] # left-padded attention_mask @@ -195,8 +305,9 @@ def generate(self, batch: DataProto, generation_config): image_data.append(image_per_sample) else: prompt_token_ids = gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask) - sglang_outputs = self.model.generate( - input_ids=prompt_token_ids, image_data=image_data, sampling_params=self.sampling_params + return_logprob = sampling_params.pop("return_logprob", False) + sglang_outputs = await self.model.async_generate( + input_ids=prompt_token_ids, image_data=image_data, sampling_params=sampling_params, return_logprob=return_logprob ) # (bs * num_return_sequences, max_response_len) @@ -208,45 +319,71 @@ def generate(self, batch: DataProto, generation_config): # (bs * num_return_sequences, input_len + max_response_len) output = concatenate_input_and_output( - input_ids=input_ids, output_ids=output_ids, num_return_sequences=self.sampling_params["n"] + input_ids=input_ids, output_ids=output_ids, num_return_sequences=sampling_params["n"] ) - - # 回归初始采样参数 - self.sampling_params = old_sampling_params - return output - # 参数同步相关接口 - def setup_collective_group(self, model_update_name, comm_plan, backend=None): - if backend is None: - backend = current_platform.communication_backend - self.model.setup_collective_group(comm_plan=comm_plan, backend=backend, rank_in_cluster=self.worker.rank) - - def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, parameter_name, is_lora=False): - self.model.broadcast_parameter(src_pp_rank, dtype, shape, parameter_name) - - def broadcast_bucket(self, model_update_name, src_pp_rank, meta_infos, bucket_size): - self.model.broadcast_bucket(src_pp_rank, meta_infos, bucket_size) - - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker): - self.model.update_parameter(parameter_name, weight, ranks_in_worker) - - def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, ranks_in_worker): - self.model.update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) - - def load_states(self, *args, **kwargs): - self.model.flush_cache() - if not self.model.is_model_in_gpu: - self.model.resume_memory_occupation() - logger.info("self.model.resume_memory_occupation exec ....") - self.model.is_model_in_gpu = True + async def setup_collective_group(self, master_address, master_port, rank_offset, world_size, group_name, backend=None): + logger.info(f"setup_collective_group {group_name=}") + return await self.model.tokenizer_manager.init_weights_update_group( + InitWeightsUpdateGroupReqInput( + master_address=master_address, + 
master_port=master_port, + group_name=group_name, + rank_offset=rank_offset, + world_size=world_size, + backend=backend if backend is not None else current_platform.communication_backend, + ) + ) - def offload_states(self, include=None, non_blocking=False): + async def broadcast_parameter(self, names, dtypes, shapes, group_name, is_lora=False): + await self._reload_model() + assert not is_lora, "lora training is not supported with sglang" + obj = UpdateWeightsFromDistributedReqInput( + names=names, dtypes=dtypes, shapes=shapes, group_name=group_name, flush_cache=False + ) + return await self.model.tokenizer_manager.update_weights_from_distributed(obj) + + async def update_parameter_in_bucket(self, serialized_named_tensors, is_lora=False): + await self._reload_model() + assert not is_lora, "lora training is not supported with sglang" + # required above sglang 0.5 + obj = UpdateWeightsFromTensorReqInput( + load_format="flattened_bucket", + flush_cache=False, + serialized_named_tensors=serialized_named_tensors, + ) + return await self.model.tokenizer_manager.update_weights_from_tensor(obj, None) + + async def _reload_model(self): + if self.is_model_in_gpu: + return + self.is_model_in_gpu = True + tags = ["weights"] + await self.model.tokenizer_manager.resume_memory_occupation(ResumeMemoryOccupationReqInput(tags=tags), None) + logger.info(f"self.model.resume_memory_occupation {tags=} exec ....") + + async def load_states(self, *args, **kwargs): + if self.has_run: # flush cache can't be called as the first request for tokenizer_manager + await self.model.tokenizer_manager.flush_cache() + tags = [] + if not self.is_model_in_gpu: + tags.append("weights") + if not self.is_kv_cache_in_gpu: + tags.extend(["kv_cache", "cuda_graph"]) + if tags: + await self.model.tokenizer_manager.resume_memory_occupation(ResumeMemoryOccupationReqInput(tags=tags), None) + logger.info(f"self.model.resume_memory_occupation {tags=} exec ....") + self.is_model_in_gpu, self.is_kv_cache_in_gpu = True, True + + async def offload_states(self, include=None, non_blocking=False): if include is None or OffloadStateType.model_params in include: - if self.worker.pipeline_config.is_train_infer_colocated and self.model.is_model_in_gpu: - self.model.release_memory_occupation() + if self.worker.pipeline_config.is_actor_infer_colocated and self.is_model_in_gpu: + await self.model.tokenizer_manager.release_memory_occupation(ReleaseMemoryOccupationReqInput(), None) logger.info("self.model.release_memory_occupation exec ....") - self.model.is_model_in_gpu = False + # always release all + self.is_model_in_gpu, self.is_kv_cache_in_gpu = False, False + gc.collect() current_platform.empty_cache() @@ -266,18 +403,6 @@ def gather_outputs_to_pad_tensor(request_outputs, pad_token_id, device=None) -> return output_tensor -def concatenate_input_and_output(input_ids, output_ids, num_return_sequences): - batch_size, input_seq_len = input_ids.size() - _, output_seq_len = output_ids.size() - repeated_input_ids = ( - input_ids.unsqueeze(1) - .repeat(1, num_return_sequences, 1) - .view(batch_size * num_return_sequences, input_seq_len) - ) - sequences = torch.cat((repeated_input_ids, output_ids), dim=1) - return sequences - - def create_sampling_params_for_sglang(gen_kwargs: dict): return dict( max_new_tokens=gen_kwargs["max_new_tokens"], @@ -287,33 +412,7 @@ def create_sampling_params_for_sglang(gen_kwargs: dict): stop_token_ids=gen_kwargs["eos_token_id"], repetition_penalty=gen_kwargs["repetition_penalty"], n=gen_kwargs["num_return_sequences"], - 
return_logprob=gen_kwargs.get("logprobs", 0) > 0, + return_logprob=gen_kwargs.get("logprobs", 0) is not None, stop=gen_kwargs["stop_strings"], no_stop_trim=gen_kwargs.get("include_stop_str_in_output", True), ) - - -def compare_sampling_params(params1: dict, params2: dict) -> bool: - # 只比较采样参数的配置 - param_attrs = [ - "temperature", - "top_p", - "top_k", - "max_new_tokens", - "n", - "stop_token_ids", - "presence_penalty", - "frequency_penalty", - "repetition_penalty", - "min_p", - "stop", - "ignore_eos", - ] - - # 比较每个采样参数 - for attr in param_attrs: - if attr in params1 and attr in params2: - if params1[attr] != params2[attr]: - print(f"采样参数 {attr} 不同: {params1[attr]} != {params2[attr]}") - return False - return True diff --git a/roll/distributed/strategy/strategy.py b/roll/distributed/strategy/strategy.py index 26c393d36..410af36ae 100644 --- a/roll/distributed/strategy/strategy.py +++ b/roll/distributed/strategy/strategy.py @@ -4,14 +4,18 @@ import torch import torch.nn.functional as F +import torch.distributed as dist from roll.distributed.scheduler.protocol import DataProto from roll.platforms import current_platform +from roll.distributed.executor.worker import Worker from roll.utils.checkpoint_manager import CheckpointManager from roll.utils.constants import IGNORE_INDEX from roll.utils.collective import collective from roll.utils.functionals import log_probs_from_logits, get_dist_info_from_comm_plan, entropy_from_logits from roll.utils.logging import get_logger +from roll.utils.cuda_ipc_utils import MultiprocessingSerializer + logger = get_logger() @@ -63,12 +67,6 @@ def get_metrics(self, metric_names: Optional[List[str]] = None) -> Dict[str, flo """ return {} - def start_server(self, *args, **kwargs): - raise NotImplementedError - - def add_request(self, command, data: DataProto, *args, **kwargs): - raise NotImplementedError() - def unwrap_model(self, *args, **kwargs): raise NotImplementedError @@ -85,16 +83,10 @@ def load_checkpoint(self, *args, **kwargs): def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, parameter_name): raise NotImplementedError - def broadcast_bucket(self, model_update_name, src_pp_rank, meta_infos, bucket_size): + def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, bucket_id, ranks_in_worker, is_lora=False): raise NotImplementedError - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker): - """ - engine模式中,p2p update要求engine能够将param 更新至指定的rank - """ - raise NotImplementedError - - def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, ranks_in_worker): + def setup_model_update(self, *args, **kwargs): raise NotImplementedError def _setup_collective_group_impl( @@ -155,7 +147,7 @@ def setup_collective_group(self, model_update_name, comm_plan, backend=None, mod self._setup_collective_group_impl(model_update_name, comm_plan, backend, mode=mode) # offload/load 相关接口 - def load_states(self): + def load_states(self, *args, **kwargs): raise NotImplementedError def offload_states(self, *args, **kwargs): @@ -180,17 +172,18 @@ def op_compute_entropy(self, logits: torch.Tensor, attention_mask: torch.Tensor) entropy = entropy[:, :-1] * attention_mask[:, 1:] return entropy - def op_compute_language_loss_from_logits(self, logits: torch.Tensor, targets: torch.Tensor): - # shift + def op_compute_language_loss_from_logits(self, logits: torch.Tensor, targets: torch.Tensor, reduction='mean'): logits = logits[..., :-1, :].contiguous() targets = targets[..., 1:].contiguous() + loss 
= F.cross_entropy( logits.view(-1, logits.size(-1)), targets.view(-1), - ignore_index=IGNORE_INDEX + ignore_index=IGNORE_INDEX, + reduction=reduction ) - mask = (targets != IGNORE_INDEX) - valid_tokens = mask.sum() + + valid_tokens = (targets.view(-1) != IGNORE_INDEX).sum() return loss, valid_tokens def op_compute_topk_logits(self, logits: torch.Tensor, topk: int = 0): @@ -311,11 +304,13 @@ def op_compute_various_divergence(self, loss_callable, logits, teacher_topk_prob # Both megatron and deepspeed can output language loss directly. # This op is mainly for computing context-parallel loss. - def op_compute_language_loss(self, losses: torch.Tensor, labels: torch.Tensor): + def op_compute_language_loss(self, losses: torch.Tensor, labels: torch.Tensor, batch_num_tokens: int): loss_mask = (labels != IGNORE_INDEX).float() loss_mask = loss_mask.view(-1).float() losses = torch.sum(losses.view(-1) * loss_mask) - return losses + losses = losses / batch_num_tokens + metrics = {f"{self.worker_config.name}/loss@sum": losses.clone().detach().item()} + return losses, metrics def op_compute_gather_by_teacher_indices( self, @@ -353,6 +348,87 @@ def op_compute_gather_by_teacher_indices( # Gather along vocab dimension (last dim) gathered_logits = torch.gather(student_logits, dim=-1, index=teacher_indices) return gathered_logits + + def process_weights_after_loading(self,*args, **kwargs): + pass + + def _get_batch_num_tokens(self, batch: DataProto, dp_group=None): + """ + Only supports `batch.meta_info["loss_mask_keys"]` as a `list[str]`. + """ + assert "loss_mask_keys" in batch.meta_info, ( + "Please set loss_mask_keys in meta info. " + "When batch_num_tokens is not required, set loss_mask_keys to an empty list []." + ) + + loss_mask_keys = batch.meta_info["loss_mask_keys"] + if not isinstance(loss_mask_keys, list): + raise TypeError(f"loss_mask_keys must be a list[str], got {type(loss_mask_keys)}") + if not all(isinstance(k, str) for k in loss_mask_keys): + raise TypeError("loss_mask_keys must be a list[str]") + + out = {} + for key in loss_mask_keys: + if key not in batch.batch: + continue + + loss_mask = batch.batch[key] + if key in ["labels", "labels_for_loss"]: + loss_mask = (loss_mask != IGNORE_INDEX) + elif key == "response_mask": + loss_mask = loss_mask[:, 1:] + + num = loss_mask.sum() + dist.all_reduce(num, op=dist.ReduceOp.SUM, group=dp_group) + + if num.item() == 0: + num = num.new_tensor(1) + + out[key] = num + + return out + + def _get_global_valid_samples(self, batch: DataProto, dp_group=None): + """ + Only supports `batch.meta_info["loss_mask_keys"]` as a `list[str]`. + """ + assert "loss_mask_keys" in batch.meta_info, ( + "Please set loss_mask_keys in meta info. " + "When global_num_tokens is not required, set loss_mask_keys to an empty list []." 
+ ) + + loss_mask_keys = batch.meta_info["loss_mask_keys"] + if not isinstance(loss_mask_keys, list): + raise TypeError(f"loss_mask_keys must be a list[str], got {type(loss_mask_keys)}") + if not all(isinstance(k, str) for k in loss_mask_keys): + raise TypeError("loss_mask_keys must be a list[str]") + + out = {} + + num_valid = torch.tensor(len(batch), device=batch.batch["input_ids"].device) + dist.all_reduce(num_valid, op=dist.ReduceOp.SUM, group=dp_group) + out["default"] = num_valid + + for key in loss_mask_keys: + if key not in batch.batch: + continue + + loss_mask = batch.batch[key] + if key in ["labels", "labels_for_loss"]: + loss_mask = (loss_mask != IGNORE_INDEX) + elif key == "response_mask": + loss_mask = loss_mask[:, 1:] + + local_valid = torch.any(loss_mask > 0, dim=-1).to(torch.long) + num_valid = local_valid.sum() + dist.all_reduce(num_valid, op=dist.ReduceOp.SUM, group=dp_group) + + if num_valid.item() == 0: + num_valid = num_valid.new_tensor(1) + + out[key] = num_valid + + return out class TrainStrategy(InferenceStrategy): @@ -367,6 +443,70 @@ def setup_collective_group(self, model_update_name, comm_plan, backend=None, mod self._setup_collective_group_impl(model_update_name, comm_plan, backend, mode=mode) + def setup_p2p_collective_group(self, model_update_name, comm_plan, backend="nccl"): + (intra_rank, info), = comm_plan.items() + collective.init_collective_group( + info["world_size"], + intra_rank, + backend=backend, + group_name=info["group_name"], + master_addr=info["master_addr"], + master_port=info["master_port"], + global_ranks=info["global_ranks"] + ) + # 可选:warm-up + collective.allreduce(torch.zeros(1).cuda(), group_name=info["group_name"]) + # 保存元数据 + if model_update_name not in self.model_update_comm_plan: + self.model_update_comm_plan[model_update_name] = {} + self.model_update_comm_plan[model_update_name][info["group_name"]] = { + "rank": intra_rank, + "world_size": info["world_size"], + "group_name": info["group_name"], + "comm_plan": comm_plan, + } + + def model_update_set_write_done_handle(self,): + """ + Set the write synchronization event required for reading and writing shared memory + """ + if not hasattr(self, "_events_inited"): + # Sender -> Receiver:Write complete + self._write_done_event = torch.cuda.Event(interprocess=True) + self._write_done_handle = self._write_done_event.ipc_handle() + # Sender <- Receiver:Read complete + self._read_done_event_remote = None + self._events_inited = True + + def model_update_set_read_done_handle(self, read_done_handles): + """ + Set the read synchronization event required for reading and writing shared memory + """ + logger.warning(f"[Rank {dist.get_rank()}] model_update_set_read_done_handle called") + read_done_handle = None + + for p2p_tgt_device in self.p2p_tgt_devices: + worker_rank = p2p_tgt_device['rank'] + local_rank = p2p_tgt_device['device']['rank'] + for read_done_handle_full_dict in read_done_handles: + if worker_rank in read_done_handle_full_dict: + read_done_handle_list = read_done_handle_full_dict[worker_rank] + for read_done_handle_dict in read_done_handle_list: + if local_rank in read_done_handle_dict: + read_done_handle = read_done_handle_dict[local_rank] + + if not hasattr(self, "_read_done_event_remote"): + if read_done_handle is not None: + logger.warning(f"[Rank {dist.get_rank()}] Creating _read_done_event_remote from handle") + self._read_done_event_remote = torch.cuda.Event.from_ipc_handle( + device=torch.cuda.current_device(), + handle=read_done_handle + ) + else: + logger.warning( + 
f"[Rank {dist.get_rank()}] No read_done_handle found, setting _read_done_event_remote=None") + self._read_done_event_remote = None + def train_step( self, batch: DataProto, diff --git a/roll/distributed/strategy/vllm_strategy.py b/roll/distributed/strategy/vllm_strategy.py index 663bdf804..0e446efe5 100644 --- a/roll/distributed/strategy/vllm_strategy.py +++ b/roll/distributed/strategy/vllm_strategy.py @@ -2,29 +2,24 @@ import copy import gc import os -import queue -import threading -import time -from collections import defaultdict, deque -from concurrent import futures -from typing import Dict, List, Optional, Union -from packaging.version import Version +from collections import deque +from typing import Dict, List, Optional import torch import torch.distributed as dist from torch.nn.utils.rnn import pad_sequence from transformers import set_seed -import vllm from vllm import RequestOutput, SamplingParams from vllm.lora.request import LoRARequest from vllm.sampling_params import RequestOutputKind, BeamSearchParams +from vllm.inputs.data import TokensPrompt from vllm.utils import random_uuid from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.protocol import DataProto, list_of_dict_to_dict_of_list from roll.distributed.strategy.strategy import InferenceStrategy -from roll.third_party.vllm import LLM, AsyncLLM -from roll.utils.functionals import GenerateRequestType, concatenate_input_and_output, reduce_metrics +from roll.third_party.vllm import create_async_llm +from roll.utils.functionals import concatenate_input_and_output, reduce_metrics from roll.utils.logging import get_logger from roll.utils.offload_states import OffloadStateType from roll.platforms import current_platform @@ -38,26 +33,33 @@ class VllmStrategy(InferenceStrategy): def __init__(self, worker: Worker): super().__init__(worker) - self.model: Union[LLM, AsyncLLM] - self.executor: futures.ThreadPoolExecutor = futures.ThreadPoolExecutor(max_workers=1) - self.pending_size = 1 - self.command_queue: Optional[queue.Queue] = None - - self.request_metas = {} - self.running = False - + # Metrics snapshot infrastructure self._metrics_snapshots = deque(maxlen=3600) self._metrics_snapshot_interval = 1.0 # Snapshot every 1 second - self._metrics_thread = None + self._metrics_task = None - def initialize(self, model_provider): + async def initialize(self, model_provider): set_seed(seed=self.worker.pipeline_config.seed) vllm_config = copy.deepcopy(self.worker_config.strategy_args.strategy_config) - engine_mode = vllm_config.pop("engine_mode", "sync") # sync/async - self.pending_size = vllm_config.pop("pending_size", 1) + # Must explicitly set VLLM_USE_V1 to pass this check: https://github.com/vllm-project/vllm/pull/14972 + os.environ["VLLM_USE_V1"] = str(vllm_config.pop("VLLM_USE_V1", 1)) self.sleep_level = vllm_config.pop("sleep_level", 1) - self.command_queue = queue.Queue() + + data_parallel_size = vllm_config.get("data_parallel_size", 1) + if data_parallel_size > 1: + logger.info( + f"VllmStrategy {self.worker.cluster_name} enable data parallel {data_parallel_size=} data_parallel_rank={self.worker.rank}" + f" data_parallel_address={os.environ['MASTER_ADDR']} data_parallel_rpc_port={os.environ['MASTER_PORT']}" + ) + assert data_parallel_size == self.worker.world_size, f"{data_parallel_size=} != {self.worker.world_size=}" + vllm_config.update( + { + "data_parallel_rank": self.worker.rank, # set data_parallel_rank to use external load balancing + "data_parallel_address": os.environ["MASTER_ADDR"], + 
"data_parallel_rpc_port": os.environ["MASTER_PORT"], + } + ) if self.worker_config.model_args.dtype == "fp32": dtype = "float32" @@ -79,6 +81,7 @@ def initialize(self, model_provider): ), # potentially hangs in tp>1 "enable_prefix_caching": vllm_config.get("enable_prefix_caching", True), "load_format": vllm_config.get("load_format", "dummy"), # use model update passed value + "max_num_batched_tokens": vllm_config.get("max_num_batched_tokens", 8192), # use default value of LLM class usage context } ) @@ -95,19 +98,17 @@ def initialize(self, model_provider): logger.info(f"vllm_config: {vllm_config}") assert not dist.is_initialized() - # set VLLM_PORT to avoid port conflict applied by vllm - vllm_port = self.worker.get_free_port() - os.environ["VLLM_PORT"] = str(vllm_port) + # Can not set VLLM_PORT explicitly in DP. Each call of get_engine_client_zmq_addr in + # DPCoordinator will return the same port, which will cause port conflict. + # https://github.com/vllm-project/vllm/blob/releases/v0.10.0/vllm/v1/engine/coordinator.py#L72 + if not data_parallel_size > 1: + # set VLLM_PORT to avoid port conflict applied by vllm + vllm_port = self.worker.get_free_port() + os.environ["VLLM_PORT"] = str(vllm_port) - if engine_mode == "sync": - self.model = LLM(resource_placement_groups=self.worker_config.resource_placement_groups, **vllm_config) - self.tokenizer = self.model.get_tokenizer() - else: - self.model = AsyncLLM( - resource_placement_groups=self.worker_config.resource_placement_groups, **vllm_config - ) - loop = asyncio.get_event_loop() - self.tokenizer = loop.run_until_complete(self.model.get_tokenizer()) + self.model = await create_async_llm(resource_placement_groups=self.worker_config.resource_placement_groups, **vllm_config) + + self.tokenizer = await self.model.get_tokenizer() additional_special_tokens = self.tokenizer.additional_special_tokens special_tokens = [ add_token @@ -124,11 +125,11 @@ def initialize(self, model_provider): self.is_model_in_gpu = True - self._metrics_thread = threading.Thread( - target=self._collect_metrics_snapshot, - name="metrics-collection" - ) - self._metrics_thread.start() + try: + from vllm.v1.metrics.reader import get_metrics_snapshot + self._metrics_task = asyncio.create_task(self._collect_metrics_snapshot()) + except Exception as e: + logger.warning(f"Failed to create metrics collector task: {e}") def op_compute_log_probs(self, logits: torch.Tensor, input_ids: torch.Tensor, attention_mask: torch.Tensor): """ @@ -136,57 +137,52 @@ def op_compute_log_probs(self, logits: torch.Tensor, input_ids: torch.Tensor, at """ pass - def generate(self, batch: DataProto, generation_config) -> torch.Tensor: + async def generate(self, batch: DataProto, generation_config) -> torch.Tensor: # Check if beam search is requested if self._should_use_beam_search(generation_config): - return self._generate_with_beam_search(batch, generation_config) + return await self._generate_with_beam_search(batch, generation_config) else: - return self._generate_standard(batch, generation_config) + return await self._generate_standard(batch, generation_config) def _should_use_beam_search(self, generation_config) -> bool: """Check if beam search should be used based on generation_config.""" return generation_config.get("num_beams", 1) > 1 or generation_config.get("use_beam_search", False) - def _generate_standard(self, batch: DataProto, generation_config) -> torch.Tensor: + async def _generate_standard(self, batch: DataProto, generation_config: Dict) -> torch.Tensor: """Standard generate method 
for non-beam search cases.""" sampling_params = create_sampling_params_for_vllm(gen_kwargs=generation_config) input_ids = batch.batch["input_ids"] # (bs, prompt_length) attention_mask = batch.batch["attention_mask"] # left-padded attention_mask - vllm_input_args = {} if "multi_modal_data" in batch.non_tensor_batch: - vllm_input_args["prompts"] = batch.non_tensor_batch["multi_modal_data"] + prompts = [TokensPrompt(data) for data in batch.non_tensor_batch["multi_modal_data"]] else: - if Version(vllm.__version__) >= Version("0.11.0"): - from vllm.inputs import TokensPrompt - prompt_token_ids_list=gather_unpadded_input_ids( - input_ids=input_ids, attention_mask=attention_mask - ) - vllm_input_args["prompts"] = [TokensPrompt(prompt_token_ids=prompt_token_ids)for prompt_token_ids in prompt_token_ids_list] - else: - vllm_input_args["prompt_token_ids"] = gather_unpadded_input_ids( - input_ids=input_ids, attention_mask=attention_mask - ) + prompts = [TokensPrompt(prompt_token_ids=prompt) + for prompt in gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask) + ] - lora_requests = None + lora_request = None if self.is_lora: - batch_size = len(input_ids) - lora_int_ids = list(self.model.llm_engine.list_loras()) + lora_int_ids = list(await self.model.list_loras()) if len(lora_int_ids) > 0: lora_int_id = lora_int_ids[0] - lora_requests = [ - LoRARequest( - lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="dummy_lora_path" - ) - ] * batch_size + lora_request = LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="dummy_lora_path") + + async def _generate(prompt): + request_id = random_uuid() + result_generator = self.model.generate( + prompt=prompt, + sampling_params=sampling_params, + request_id=request_id, + lora_request=lora_request, + ) + output: Optional[RequestOutput] = None + async for result in result_generator: + output = result + return output - vllm_outputs = self.model.generate( - sampling_params=sampling_params, - use_tqdm=False, - lora_request=lora_requests, - **vllm_input_args, - ) + vllm_outputs = await asyncio.gather(*[_generate(prompt) for prompt in prompts]) # (bs * num_return_sequences, max_response_len) output_ids = gather_outputs_to_pad_tensor( @@ -202,7 +198,7 @@ def _generate_standard(self, batch: DataProto, generation_config) -> torch.Tenso return output - def _generate_with_beam_search(self, batch: DataProto, generation_config) -> torch.Tensor: + async def _generate_with_beam_search(self, batch: DataProto, generation_config: Dict) -> torch.Tensor: """Generate using beam search method.""" # Create beam search parameters beam_params = BeamSearchParams( @@ -231,20 +227,24 @@ def _generate_with_beam_search(self, batch: DataProto, generation_config) -> tor prompts = [{"prompt_token_ids": token_ids} for token_ids in token_lists] # Call beam_search method - beam_search_outputs = self.model.beam_search( - prompts=prompts, - params=beam_params, - ) + async def _beam_search(prompt): + request_id = random_uuid() + result_generator = self.model.beam_search( + prompt=prompt, + request_id=request_id, + params=beam_params, + ) + output: Optional[RequestOutput] = None + async for result in result_generator: + output = result + return output + + beam_search_outputs = await asyncio.gather(*[_beam_search(prompt) for prompt in prompts]) generated_token_ids = [] - token_ids = [prompt['prompt_token_ids'] for prompt in prompts] - for batch_idx, output in enumerate(beam_search_outputs): - # Each output contains beam_width sequences - for 
beam_idx, sequence in enumerate(output.sequences): - # Get prompt length for this input - prompt_length = len(token_ids[batch_idx]) - # Extract only the generated tokens (exclude prompt) - generated_tokens = sequence.tokens[prompt_length:] + for request_output in beam_search_outputs: + for completion_output in request_output.outputs: + generated_tokens = completion_output.token_ids generated_token_ids.append(torch.tensor(generated_tokens, device=input_ids.device)) # Pad the sequences @@ -259,217 +259,134 @@ def _generate_with_beam_search(self, batch: DataProto, generation_config) -> tor return output - def process_vllm_output(self, vllm_outputs: List[RequestOutput], request_complete_callback, collect_unfinished=False): - # 转成response id, request_complete_callback - report_request_ids = [] - for request_output in vllm_outputs: - if not (request_output.finished or collect_unfinished): - continue - request_id = request_output.request_id - meta_info = self.request_metas.pop(request_id, None) - if meta_info is None: - continue - output_token_ids, finish_reasons, logprobs = [], [], [] - for completion_output in request_output.outputs: - output_token_ids.append(completion_output.token_ids) - finish_reasons.append(completion_output.finish_reason) - if completion_output.logprobs is not None: - logprobs.append( - [ - float(lps[token_id].logprob) - for token_id, lps in zip(completion_output.token_ids, completion_output.logprobs) - ] - ) - output_data = DataProto(meta_info=meta_info) - output_data.meta_info["output_token_ids"] = output_token_ids - output_data.meta_info["finish_reasons"] = finish_reasons - output_data.meta_info["output_logprobs"] = logprobs - request_complete_callback(data=output_data) - report_request_ids.append(request_id) - return report_request_ids - - def start_server(self, data: DataProto, request_complete_callback): - self.command_queue = queue.Queue() - self.running = True + async def generate_request(self, data: DataProto): collect_unfinished = data.meta_info.get("collect_unfinished", False) + input_ids = data.batch["input_ids"] + attention_mask = data.batch["attention_mask"] + request_id = data.meta_info["request_id"] + generation_config = data.meta_info.get("generation_config") + max_new_tokens = data.meta_info.get("max_new_tokens", generation_config["max_new_tokens"]) + max_new_tokens = min(max_new_tokens, generation_config["max_new_tokens"]) + output_kind = RequestOutputKind.CUMULATIVE if collect_unfinished else RequestOutputKind.FINAL_ONLY + sampling_params = create_sampling_params_for_vllm( + gen_kwargs={**generation_config, "max_new_tokens": max_new_tokens, "output_kind": output_kind} + ) + assert sampling_params.n == 1 or not collect_unfinished, "collect_unfinished is not supported in parallel sampling" + if "multi_modal_data" in data.non_tensor_batch: + assert len(data.non_tensor_batch["multi_modal_data"]) == 1 + prompt_token_ids = data.non_tensor_batch["multi_modal_data"][0]["prompt_token_ids"] + multi_modal_data = (data.non_tensor_batch["multi_modal_data"][0]["multi_modal_data"] + if "multi_modal_data" in data.non_tensor_batch["multi_modal_data"][0] else None) + prompt = TokensPrompt(prompt_token_ids=prompt_token_ids, multi_modal_data=multi_modal_data) + else: + assert input_ids.size(0) == 1, f"data['input_ids'] must have exactly one batch dimension" + prompt_token_ids = gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask) + assert len(prompt_token_ids) == 1 + prompt = TokensPrompt(prompt_token_ids=prompt_token_ids[0]) + lora_request = None 
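For orientation, a minimal sketch of what `gather_unpadded_input_ids` is expected to do here (an illustration, not the project's implementation): with left padding, keep only the positions where `attention_mask == 1`, so each batch row becomes one variable-length prompt token list.

import torch

def gather_unpadded(input_ids: torch.Tensor, attention_mask: torch.Tensor):
    # Returns one Python list of token ids per batch row; padding positions are dropped.
    return [row[mask.bool()].tolist() for row, mask in zip(input_ids, attention_mask)]

# Example:
# ids  = torch.tensor([[0, 0, 5, 6], [0, 7, 8, 9]])
# mask = torch.tensor([[0, 0, 1, 1], [0, 1, 1, 1]])
# gather_unpadded(ids, mask) -> [[5, 6], [7, 8, 9]]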
+ if self.is_lora: + lora_int_ids = list(await self.model.list_loras()) + if len(lora_int_ids) > 0: + lora_int_id = lora_int_ids[0] + lora_request = LoRARequest(lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="dummy_lora_path") - while True: - while not self.command_queue.empty(): - command, batch = self.command_queue.get_nowait() - if command == GenerateRequestType.ADD: - input_ids = batch.batch["input_ids"] - attention_mask = batch.batch["attention_mask"] - request_id = batch.meta_info["request_id"] - self.request_metas[request_id] = batch.meta_info - generation_config = batch.meta_info.get("generation_config") - max_new_tokens = batch.meta_info.get("max_new_tokens", generation_config["max_new_tokens"]) - max_new_tokens = min(max_new_tokens, generation_config["max_new_tokens"]) - output_kind = RequestOutputKind.CUMULATIVE if collect_unfinished else RequestOutputKind.FINAL_ONLY - sampling_params = create_sampling_params_for_vllm( - gen_kwargs={**generation_config, "max_new_tokens": max_new_tokens, "output_kind": output_kind} - ) - if "multi_modal_data" in batch.non_tensor_batch: - prompt_token_ids = [ - batch.non_tensor_batch["multi_modal_data"][0] - ["prompt_token_ids"] - ] - multi_modal_data = ( - [batch.non_tensor_batch["multi_modal_data"][0]["multi_modal_data"]] - if "multi_modal_data" in batch.non_tensor_batch["multi_modal_data"][0] - else None - ) - else: - prompt_token_ids = gather_unpadded_input_ids( - input_ids=input_ids, attention_mask=attention_mask - ) - multi_modal_data = None - lora_requests = None - if self.is_lora: - batch_size = len(prompt_token_ids) - lora_int_ids = list(self.model.llm_engine.list_loras()) - if len(lora_int_ids) > 0: - lora_int_id = lora_int_ids[0] - lora_requests = [ - LoRARequest( - lora_name=f"{lora_int_id}", lora_int_id=lora_int_id, lora_path="dummy_lora_path" - ) - ] * batch_size - self.model.add_requests( - request_ids=[request_id], - prompt_token_ids=prompt_token_ids, - sampling_params=sampling_params, - multi_modal_data=multi_modal_data, - lora_requests=lora_requests, - ) - elif command == GenerateRequestType.ABORT: - request_id = batch.meta_info["request_id"] - self.model.abort_request(request_id=request_id) - elif command == GenerateRequestType.STOP: - stop_time = time.time() - wait_seconds = 120 - while collect_unfinished and len(self.request_metas) > 0: # for partial rollout - vllm_outputs: List[RequestOutput] = self.model.fetch_output() - processed_request_ids = self.process_vllm_output( - vllm_outputs=vllm_outputs, - request_complete_callback=request_complete_callback, - collect_unfinished=collect_unfinished, - ) - if time.time() - stop_time > wait_seconds: - logger.warning(f"Timeout after {wait_seconds}s waiting for running requests to complete. " - f"Remaining running requests: {len(self.request_metas)}") - break - self.model.abort_request(request_id=processed_request_ids) - self.model.abort_request(request_id=list(self.request_metas.keys())) - self.request_metas.clear() - while not self.command_queue.empty(): - self.command_queue.get_nowait() - # Run llm_engine again to consume all out standing requests and - # stop model execute loop, otherwise collective_rpc will stuck by - # model execute loop or there will be garbage output at next step. 
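The async `_generate`/`_beam_search` helpers above follow one pattern: each prompt gets its own consumer that drains the engine's result stream and keeps the last item, and `asyncio.gather` runs the consumers concurrently. A self-contained sketch of that pattern with a stand-in stream (nothing below is the vLLM API):

import asyncio

async def fake_stream(prompt: str):
    # Stand-in for an engine result generator; the last yielded item is the final output.
    for step in range(3):
        await asyncio.sleep(0)
        yield f"{prompt}:step{step}"

async def consume_last(prompt: str):
    output = None
    async for result in fake_stream(prompt):
        output = result
    return output

async def main():
    outputs = await asyncio.gather(*[consume_last(p) for p in ["a", "b", "c"]])
    print(outputs)  # ['a:step2', 'b:step2', 'c:step2']

# asyncio.run(main())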
-                    self.model.clear_unfinished_requests()
-                    self.running = False
-                    return
-
-            vllm_outputs: List[RequestOutput] = self.model.fetch_output()
-            self.process_vllm_output(vllm_outputs=vllm_outputs, request_complete_callback=request_complete_callback)
-
-    def add_request(self, command, data: DataProto):
-        self.command_queue.put((command, data))
-
-    async def async_generate(self, batch: DataProto, generation_config: Dict) -> torch.Tensor:
-        # TODO: refactor async_generate interface. not supported now!
-        raise NotImplementedError()
-        from vllm.inputs.data import TokensPrompt
-
-        sampling_params = create_sampling_params_for_vllm(gen_kwargs=generation_config)
-
-        input_ids = batch.batch["input_ids"]  # (bs, prompt_length)
-        attention_mask = batch.batch["attention_mask"]  # left-padded attention_mask
-        assert input_ids.size(0) == 1, f"async_generate: batch['input_ids'] must have exactly one batch dimension"
-
-        prompt_token_ids = gather_unpadded_input_ids(input_ids=input_ids, attention_mask=attention_mask)
-
-        # TODO meaningful request id?
-        # how should async_generate implement abort_request?
-        request_id = random_uuid()
         result_generator = self.model.generate(
-            prompt=TokensPrompt(prompt_token_ids=prompt_token_ids[0]),
+            prompt=prompt,
             sampling_params=sampling_params,
             request_id=request_id,
+            lora_request=lora_request,
         )
-        vllm_output: Optional[RequestOutput] = None
-        async for request_output in result_generator:
-            vllm_output = request_output
-        assert vllm_output is not None
+        output: Optional[RequestOutput] = None
+        # vLLM supports partial rollout in v1 from 0.10.1, and will return finished output
+        # with finish_reason set no matter what RequestOutputKind is.
+        # For compatibility, the following except block is only needed for v0 and older versions of v1.
+        try:
+            async for result in result_generator:
+                output = result
+        except asyncio.CancelledError:
+            if output is None:
+                output_data = DataProto(meta_info=data.meta_info)
+                output_data.meta_info["finish_reasons"] = ["abort"]
+                return output_data
+
+        output_token_ids, finish_reasons, logprobs = [], [], []
+        for completion_output in output.outputs:
+            output_token_ids.append(completion_output.token_ids)
+            # For compatibility, older versions may return unfinished results; set their finish_reason to 'abort'.
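The try/except above is what makes request aborts safe: if the scheduler cancels the task mid-stream, any partial output is kept, and an "abort" finish reason is reported when nothing was produced. A runnable sketch of the same control flow, independent of vLLM:

import asyncio

async def slow_stream():
    for i in range(100):
        await asyncio.sleep(0.01)
        yield i

async def generate_with_abort():
    output = None
    try:
        async for result in slow_stream():
            output = result
    except asyncio.CancelledError:
        # Cancelled before completion: keep whatever was produced so far.
        return {"finish_reasons": ["abort"], "partial_output": output}
    return {"finish_reasons": ["stop"], "output": output}

async def main():
    task = asyncio.create_task(generate_with_abort())
    await asyncio.sleep(0.05)
    task.cancel()
    print(await task)  # e.g. {'finish_reasons': ['abort'], 'partial_output': 3}

# asyncio.run(main())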
+ finish_reason = "abort" if completion_output.finish_reason is None else completion_output.finish_reason + finish_reasons.append(finish_reason) + if completion_output.logprobs is not None: + logprobs.append( + [ + float(lps[token_id].logprob) + for token_id, lps in zip(completion_output.token_ids, completion_output.logprobs) + ] + ) + output_data = DataProto(meta_info=data.meta_info) + output_data.meta_info["output_token_ids"] = output_token_ids + output_data.meta_info["finish_reasons"] = finish_reasons + output_data.meta_info["output_logprobs"] = logprobs + return output_data - # (bs * num_return_sequences, max_response_len) - output_ids = gather_outputs_to_pad_tensor( - request_outputs=[vllm_output], pad_token_id=self.tokenizer.pad_token_id, device=input_ids.device - ) - # (bs * num_return_sequences, input_len + max_response_len) - output = concatenate_input_and_output( - input_ids=input_ids, output_ids=output_ids, num_return_sequences=sampling_params.n - ) - return output + async def abort_requests(self, request_ids): + for id in request_ids: + await self.model.abort(request_id=id) # offload/reload 接口 - def load_states(self, *args, **kwargs): - self.model.reset_prefix_cache() + async def load_states(self, *args, **kwargs): + await self.model.reset_prefix_cache() if not self.is_model_in_gpu: - self.model.load_states() + await self.model.load_states() self.is_model_in_gpu = True - def offload_states(self, include=None, non_blocking=False): + async def offload_states(self, include=None, non_blocking=False): + await self.model.reset_prefix_cache() if include is None or OffloadStateType.model_params in include: - if self.is_model_in_gpu and self.worker.pipeline_config.is_train_infer_colocated: - self.model.offload_states(self.sleep_level) + if self.is_model_in_gpu and self.worker.pipeline_config.is_actor_infer_colocated: + await self.model.offload_states(self.sleep_level) self.is_model_in_gpu = False gc.collect() current_platform.empty_cache() + + def process_weights_after_loading(self,*args, **kwargs): + self.model.process_weights_after_loading() # 参数同步相关接口 - def setup_collective_group(self, model_update_name, comm_plan, backend=None): - if backend is None: - backend = current_platform.communication_backend - self.model.setup_collective_group(comm_plan=comm_plan, backend=backend, rank_in_cluster=self.worker.rank) - - def broadcast_parameter(self, model_update_name, src_pp_rank, dtype, shape, parameter_name, is_lora=False): - self.model.broadcast_parameter(src_pp_rank, dtype, shape, parameter_name, is_lora) - - def broadcast_bucket(self, model_update_name, src_pp_rank, meta_infos, bucket_size): - self.model.broadcast_bucket(src_pp_rank, meta_infos, bucket_size) + async def setup_collective_group(self, master_address, master_port, rank_offset, world_size, group_name, backend=None): + logger.info(f"setup_collective_group {group_name=}") + backend = backend if backend is not None else current_platform.communication_backend + await self.model.setup_collective_group(master_address, master_port, rank_offset, world_size, group_name, backend) - def update_parameter(self, model_update_name, parameter_name, weight, ranks_in_worker, is_lora=False): - self.model.update_parameter(parameter_name, weight, ranks_in_worker, is_lora) + async def broadcast_parameter(self, names, dtypes, shapes, group_name, is_lora=False): + await self.model.broadcast_parameter(names, dtypes, shapes, group_name, is_lora) - def update_parameter_in_bucket(self, model_update_name, meta_infos, buffer, ranks_in_worker): - 
self.model.update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) + async def update_parameter_in_bucket(self, serialized_named_tensors, is_lora=False): + await self.model.update_parameter_in_bucket(serialized_named_tensors, is_lora) - def add_lora(self, peft_config): - self.model.add_lora(peft_config) + async def add_lora(self, peft_config): + peft_config["target_modules"] = set(self.worker_config.model_args.lora_target) + await self.model.add_lora(peft_config) - def _collect_metrics_snapshot(self): + async def _collect_metrics_snapshot(self): """Collect metrics snapshots periodically in a background thread.""" - try: - while True: - raw_metrics = self.model.get_metrics() - snapshot = { - 'vllm/kv_cache_usage_perc_max': [], - 'vllm/num_requests_waiting_max': [], - 'vllm/num_preemptions_max': [] - } - for metric in raw_metrics: - if metric.name == "vllm:kv_cache_usage_perc": - snapshot['vllm/kv_cache_usage_perc_max'].append(metric.value) - elif metric.name == "vllm:num_requests_waiting": - snapshot['vllm/num_requests_waiting_max'].append(metric.value) - elif metric.name == "vllm:num_preemptions": - snapshot['vllm/num_preemptions_max'].append(metric.value) - self._metrics_snapshots.append(snapshot) - - time.sleep(self._metrics_snapshot_interval) - except Exception as e: - logger.warning(f"Failed to get metrics: {e}") + from vllm.v1.metrics.reader import get_metrics_snapshot + while True: + raw_metrics = get_metrics_snapshot() + snapshot = { + 'vllm/kv_cache_usage_perc_max': [], + 'vllm/num_requests_waiting_max': [], + 'vllm/num_preemptions_max': [] + } + for metric in raw_metrics: + if metric.name == "vllm:kv_cache_usage_perc": + snapshot['vllm/kv_cache_usage_perc_max'].append(metric.value) + elif metric.name == "vllm:num_requests_waiting": + snapshot['vllm/num_requests_waiting_max'].append(metric.value) + elif metric.name == "vllm:num_preemptions": + snapshot['vllm/num_preemptions_max'].append(metric.value) + self._metrics_snapshots.append(snapshot) + + await asyncio.sleep(self._metrics_snapshot_interval) def get_metrics(self, metric_names: Optional[List[str]] = None) -> Dict[str, float]: """ @@ -505,6 +422,7 @@ def gather_outputs_to_pad_tensor(request_outputs: List["RequestOutput"], pad_tok def create_sampling_params_for_vllm(gen_kwargs): + # TODO vllm 0.10.2 support partial rollout, and do not need to set RequestOutputKind to CUMULATIVE output_kind = gen_kwargs.get("output_kind", RequestOutputKind.FINAL_ONLY) if output_kind != RequestOutputKind.FINAL_ONLY: assert gen_kwargs["num_return_sequences"] == 1, ( diff --git a/roll/models/model_providers.py b/roll/models/model_providers.py index 939badc92..32ebc2b3f 100644 --- a/roll/models/model_providers.py +++ b/roll/models/model_providers.py @@ -1,7 +1,10 @@ +import inspect import os -from typing import List, Optional +import threading +from typing import Any, List, Optional import torch +import torch.nn as nn from peft import LoraConfig, TaskType, get_peft_model from transformers import ( AutoConfig, @@ -18,22 +21,50 @@ from transformers.integrations import is_deepspeed_zero3_enabled from transformers.modeling_utils import is_fsdp_enabled +from roll.configs import ModelArguments +from roll.platforms import current_platform +from roll.utils.checkpoint_manager import download_model, file_lock_context +from roll.utils.logging import get_logger +from roll.utils.packages import is_transformers_version_greater_than try: from mcore_adapter import TrainingArguments as mca_TrainingArguments + from mcore_adapter.adapters import ( + 
apply_megatron_lora, + find_all_embedding_modules, + find_all_linear_modules, + find_all_router_modules, + set_linear_is_expert, + ) from mcore_adapter.models import AutoModel except Exception as e: mca_TrainingArguments = None -from roll.configs import ModelArguments -from roll.utils.checkpoint_manager import download_model, file_lock_context -from roll.utils.logging import get_logger -from roll.utils.packages import is_transformers_version_greater_than -from roll.platforms import current_platform - logger = get_logger() +# Thread-local storage for FSDP2 initialization context +_fsdp2_init_context = threading.local() + + +def set_fsdp2_init_context(context): + _fsdp2_init_context.context = context + + +def clear_fsdp2_init_context(): + if hasattr(_fsdp2_init_context, "context"): + delattr(_fsdp2_init_context, "context") + + +def is_fsdp2_enabled(): + if hasattr(_fsdp2_init_context, "context"): + return True + return False + + +def is_fsdp_or_fsdp2_enabled(): + return is_fsdp_enabled() or is_fsdp2_enabled() + def prepare_automap_files(model_path: str): python_files = [] @@ -49,7 +80,7 @@ def prepare_automap_files(model_path: str): pass -def default_tokenizer_provider(model_args: "ModelArguments", model_name_or_path: str=None): +def default_tokenizer_provider(model_args: "ModelArguments", model_name_or_path: str = None): if model_args.model_type == "diffusion_module": return None if model_name_or_path is None: @@ -66,7 +97,7 @@ def default_tokenizer_provider(model_args: "ModelArguments", model_name_or_path: return tokenizer -def default_processor_provider(model_args: "ModelArguments", model_name_or_path: str=None): +def default_processor_provider(model_args: "ModelArguments", model_name_or_path: str = None): if model_args.model_type == "diffusion_module": return None if model_name_or_path is None: @@ -119,22 +150,38 @@ def freeze_model(model, model_args: "ModelArguments"): # Inspired by: https://github.com/hiyouga/LLaMA-Factory/blob/main/src/llamafactory/model/adapter.py -def setup_lora_training(config, model, model_args: "ModelArguments", is_trainable: Optional[bool] = False): +def setup_lora_training( + config, model, model_args: "ModelArguments", is_trainable: Optional[bool] = False, is_mca: Optional[bool] = False +): model.enable_input_require_grads() - if is_trainable: - target_modules = model_args.lora_target + def get_target_modules(model: "torch.nn.Module", model_args: "ModelArguments"): + target_modules = model_args.lora_target + if "all-linear" in model_args.lora_target: + target_modules.remove("all-linear") + target_modules += find_all_linear_modules(model) + if "all-embedding" in model_args.lora_target: + target_modules.remove("all-embedding") + target_modules += find_all_embedding_modules(model) + if "all-router" in model_args.lora_target: + target_modules.remove("all-router") + target_modules += find_all_router_modules(model) + return target_modules + + target_modules = get_target_modules(model, model_args) lora_config = { - "task_type": TaskType.CAUSAL_LM, "r": model_args.lora_rank, "target_modules": target_modules, "lora_alpha": model_args.lora_alpha, "lora_dropout": model_args.lora_dropout, "modules_to_save": model_args.additional_target, } - - model = get_peft_model(model, LoraConfig(**lora_config)) + if not is_mca: + lora_config.update({"task_type": TaskType.CAUSAL_LM}) + model = get_peft_model( + model, LoraConfig(**lora_config), autocast_adapter_dtype=model_args.autocast_adapter_dtype + ) return model @@ -148,10 +195,28 @@ def load_model( """ model_name_or_path = 
download_model(model_args.model_name_or_path) prepare_automap_files(model_args.model_name_or_path) - init_kwargs = {"trust_remote_code": True, **model_args.model_config_kwargs} + init_kwargs = { + "trust_remote_code": True, + **model_args.model_config_kwargs, + } config = AutoConfig.from_pretrained(model_name_or_path, **init_kwargs) if model_args.attn_implementation is not None and model_args.attn_implementation != "auto": setattr(config, "_attn_implementation", model_args.attn_implementation) + + # --------------------------------------------------------------------- + # PumpkinComment: + # Ref: https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py + # Many VLMs have a separate vision attention stack. When Ulysses/CP is enabled, we patch + # HF flash-attention paths for text attention. However, vision attention often: + # - does not carry the same kwargs (e.g. missing position_ids), or + # - calls into different flash-attn wrappers, + # which can cause mismatched collectives / deadlocks across ranks. + ulysses_size = int(model_args.ulysses_size or 1) + if getattr(config, "vision_config", None) is not None: + vc = config.vision_config + setattr(vc, "_attn_implementation", "sdpa") + setattr(vc, "attn_implementation", "sdpa") + if not is_trainable: setattr(config, "use_cache", True) else: @@ -159,8 +224,8 @@ def load_model( if model_args.moe_aux_loss_coef is not None: setattr(config, "router_aux_loss_coef", model_args.moe_aux_loss_coef) setattr(config, "output_router_logits", is_trainable) - init_kwargs["low_cpu_mem_usage"] = not is_deepspeed_zero3_enabled() - if not is_deepspeed_zero3_enabled() and not is_fsdp_enabled(): + init_kwargs["low_cpu_mem_usage"] = not is_deepspeed_zero3_enabled() and not is_fsdp2_enabled() + if not is_deepspeed_zero3_enabled() and not is_fsdp_or_fsdp2_enabled(): init_kwargs["torch_dtype"] = model_args.compute_dtype if init_kwargs["low_cpu_mem_usage"]: # device map requires low_cpu_mem_usage=True if "device_map" not in init_kwargs and model_args.device_map: @@ -172,9 +237,26 @@ def load_model( model_class = AutoModelForVision2Seq # image and video else: model_class = AutoModelForCausalLM # text - model = model_class.from_pretrained(**init_kwargs) + + fsdp2_init_context = getattr(_fsdp2_init_context, "context", None) + + if fsdp2_init_context is not None: + with fsdp2_init_context(): + model = model_class.from_pretrained(**init_kwargs) + else: + model = model_class.from_pretrained(**init_kwargs) + if not model_args.disable_gradient_checkpointing: - model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": False}) + # PumpkinComment: + # - use_reentrant=False is generally preferred, but some MoE models can produce + # a different set of autograd-saved tensors between forward and recomputation, + # which triggers torch.utils.checkpoint.CheckpointError. 
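For reference, a stand-alone sketch of what the "all-linear" expansion in `get_target_modules` above amounts to. The real helpers come from `mcore_adapter.adapters`; this simplified version, with an illustrative `exclude` default, only shows the idea:

import torch.nn as nn

def find_linear_module_names(model: nn.Module, exclude=("lm_head",)):
    # Collect the leaf names of every nn.Linear so LoRA can target them by name.
    names = set()
    for full_name, module in model.named_modules():
        if isinstance(module, nn.Linear) and full_name:
            leaf = full_name.split(".")[-1]
            if leaf not in exclude:
                names.add(leaf)
    return sorted(names)

# For a typical attention block with q_proj/k_proj/v_proj/o_proj Linears this returns
# ['k_proj', 'o_proj', 'q_proj', 'v_proj'], which is what replaces the "all-linear"
# placeholder in model_args.lora_target.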
+ if model_args.gradient_checkpointing_use_reentrant is None: + use_reentrant = True if _is_moe_config(config) else False + else: + use_reentrant = bool(model_args.gradient_checkpointing_use_reentrant) + + model.gradient_checkpointing_enable(gradient_checkpointing_kwargs={"use_reentrant": use_reentrant}) if model_args.lora_target is None: freeze_model(model, model_args) @@ -209,6 +291,30 @@ def load_model( else: model.train() + # Debug case, we may not use the default_actor model provider + patch_model(model, config, use_mcore=False) + + if ulysses_size > 1 and getattr(config, "vision_config", None) is not None: + model_type = getattr(config, "model_type", None) or "" + if model_type in ("qwen2_5_vl", "qwen3_vl"): + from roll.utils.context_parallel.vlm_cp_patch import patch_vlm_decoder_for_cp + + decoder = find_vlm_text_decoder(model) + if decoder is None: + logger.warning(f"CP(VLM) enabled but failed to locate text decoder for model_type={model_type}") + else: + patch_vlm_decoder_for_cp(decoder, name=f"{model_type}.text_decoder") + else: + logger.warning(f"CP(VLM) enabled but model_type={model_type} not fully tested") + + if is_fsdp2_enabled() and getattr(config, "vision_config", None) is not None: + # PumpkinComment: + # Otherwise we will have precision issue + vision_tower_blocks = get_vl_model_vision_tower_blocks(model) + if vision_tower_blocks is not None: + for block in vision_tower_blocks: + block._fsdp2_cast_forward_inputs = False + return model @@ -217,16 +323,43 @@ def patch_model(model, config, use_mcore): model_type = config.model_type + # Avoid double-patching when multiple providers call patch_model() + if getattr(model, "_roll_forward_patched", False): + return + forward_patch = None # patch to force vit forward with mock image to avoid hang if not use_mcore: - if "qwen2_vl" == model_type or "qwen2_5_vl" == model_type: - if is_peft_model := getattr(model, "peft_config", None) is not None: + if model_type in ("qwen2_vl", "qwen2_5_vl", "qwen3_vl"): + if is_peft_model := (getattr(model, "peft_config", None) is not None): ori_forward = type(model.get_base_model()).forward else: ori_forward = type(model).forward def _handle_missing_visual(self, inputs_embeds: "torch.FloatTensor"): + if getattr(self.config, "model_type", None) == "qwen3_vl": + # Qwen3-VL vision forward returns (image_embeds, deepstack_embeds_list) + patch_dim = ( + self.config.vision_config.in_channels + * self.config.vision_config.temporal_patch_size + * self.config.vision_config.patch_size + * self.config.vision_config.patch_size + ) + mock_pixel_values = torch.zeros( + 16, + patch_dim, + device=inputs_embeds.device, + dtype=inputs_embeds.dtype, + ) + mock_grid_thw = torch.LongTensor([[1, 4, 4]]).to(inputs_embeds.device) + vision_out = self.visual(mock_pixel_values, grid_thw=mock_grid_thw) + image_embeddings = vision_out[0] if isinstance(vision_out, tuple) else vision_out + deepstack_list = vision_out[1] if isinstance(vision_out, tuple) and len(vision_out) > 1 else [] + inputs_embeds = inputs_embeds + image_embeddings.mean() * 0 + for emb in deepstack_list or []: + inputs_embeds = inputs_embeds + emb.mean() * 0 + return inputs_embeds + mock_pixel_values = torch.zeros( 4, self.config.vision_config.in_channels @@ -264,31 +397,32 @@ def forward_patch( assert inputs_embeds is None if kwargs.pop("force_vit_image", False) and pixel_values is None: # force vit forward with mock image to avoid hang - inputs_embeds = self.model.embed_tokens(input_ids) + inputs_embeds = self.get_input_embeddings()(input_ids) inputs_embeds 
= _handle_missing_visual(self, inputs_embeds) if kwargs.pop("force_vit_video", False) and pixel_values_videos is None: if inputs_embeds is None: - inputs_embeds = self.model.embed_tokens(input_ids) + inputs_embeds = self.get_input_embeddings()(input_ids) # force vit forward with mock image to avoid hang inputs_embeds = _handle_missing_visual(self, inputs_embeds) return ori_forward( self, - input_ids, - attention_mask, - position_ids, - past_key_values, - inputs_embeds, - labels, - use_cache, - output_attentions, - output_hidden_states, - # return_dict, - pixel_values, - pixel_values_videos, - image_grid_thw, - video_grid_thw, - rope_deltas, - cache_position, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + labels=labels, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + rope_deltas=rope_deltas, + cache_position=cache_position, + **kwargs, ) if forward_patch is not None: @@ -296,6 +430,7 @@ def forward_patch( model.get_base_model().forward = types.MethodType(forward_patch, model.get_base_model()) else: model.forward = types.MethodType(forward_patch, model) + setattr(model, "_roll_forward_patched", True) def default_diffusion_module_provider( @@ -306,8 +441,9 @@ def default_diffusion_module_provider( ): if model_args.model_config_kwargs["model_name"] == "wan2_2": from roll.pipeline.diffusion.modules.wan_module import WanTrainingModule + print(f"{model_args.model_config_kwargs=}") - training_module = WanTrainingModule(**model_args.model_config_kwargs) + training_module = WanTrainingModule(**model_args.model_config_kwargs) else: raise NotImplementedError(f"model_type {model_args.model_type} not implemented yet") @@ -320,9 +456,9 @@ def default_actor_model_provider( training_args: "TrainingArguments" = None, is_trainable: Optional[bool] = False, ): + model_args.model_name_or_path = download_model(model_args.model_name_or_path) config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) old_model_name_or_path = model_args.model_name_or_path - model_args.model_name_or_path = download_model(model_args.model_name_or_path) prepare_automap_files(model_args.model_name_or_path) if ( mca_TrainingArguments is not None @@ -341,7 +477,12 @@ def default_actor_model_provider( model.eval() for param in model.parameters(): param.requires_grad = False - freeze_model(model, model_args) + if model_args.lora_target is None: + freeze_model(model, model_args) + else: + apply_megatron_lora() + set_linear_is_expert(model[0]) + model.models[0] = setup_lora_training(model[0].config, model[0], model_args, is_trainable, is_mca=True) patch_model(model, config, use_mcore=True) else: # hf @@ -349,7 +490,7 @@ def default_actor_model_provider( "torch_dtype": model_args.compute_dtype, "trust_remote_code": True, } - if not is_deepspeed_zero3_enabled(): + if not is_deepspeed_zero3_enabled() and not is_fsdp2_enabled(): init_kwargs["low_cpu_mem_usage"] = True if is_trainable: init_kwargs["device_map"] = {"": current_platform.current_device()} @@ -409,7 +550,11 @@ class TokenClassifierOutput(ModelOutput): config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) config.num_labels = model_args.num_labels if model_args.attn_implementation is not None 
and model_args.attn_implementation != "auto": - setattr(config, "_attn_implementation", model_args.attn_implementation) + setattr( + config, + "_attn_implementation", + model_args.attn_implementation, + ) model = AutoModelForSequenceClassification.from_pretrained( model_args.model_name_or_path, config=config, **init_kwargs ) @@ -418,7 +563,11 @@ class TokenClassifierOutput(ModelOutput): config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) config.num_labels = model_args.num_labels if model_args.attn_implementation is not None and model_args.attn_implementation != "auto": - setattr(config, "_attn_implementation", model_args.attn_implementation) + setattr( + config, + "_attn_implementation", + model_args.attn_implementation, + ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, config=config, **init_kwargs ) @@ -434,8 +583,12 @@ class TokenClassifierOutput(ModelOutput): AutoModelForCausalLMWithValueHead.post_init = no_set_device_hook_post_init model = load_model(model_args, is_trainable, True) setattr(model, "forward", token_classifier_forward.__get__(model)) - setattr(model, "load_state_dict", value_head_load_state_dict.__get__(model)) - logger.info(f"patch AutoModelForCausalLMWithValueHead load_state_dict and forward") + setattr( + model, + "load_state_dict", + value_head_load_state_dict.__get__(model), + ) + logger.info("patch AutoModelForCausalLMWithValueHead load_state_dict and forward") else: raise NotImplementedError if model.config.pad_token_id is None: @@ -482,7 +635,11 @@ def default_value_model_provider( config = AutoConfig.from_pretrained(model_args.model_name_or_path, trust_remote_code=True) config.num_labels = model_args.num_labels if model_args.attn_implementation is not None and model_args.attn_implementation != "auto": - setattr(config, "_attn_implementation", model_args.attn_implementation) + setattr( + config, + "_attn_implementation", + model_args.attn_implementation, + ) model = AutoModelForTokenClassification.from_pretrained( model_args.model_name_or_path, config=config, **init_kwargs ) @@ -498,7 +655,11 @@ def default_value_model_provider( AutoModelForCausalLMWithValueHead.post_init = no_set_device_hook_post_init model = load_model(model_args, is_trainable, True) setattr(model, "forward", token_classifier_forward.__get__(model)) - setattr(model, "load_state_dict", value_head_load_state_dict.__get__(model)) + setattr( + model, + "load_state_dict", + value_head_load_state_dict.__get__(model), + ) else: raise NotImplementedError if model.config.pad_token_id is None: @@ -511,44 +672,219 @@ def default_value_model_provider( def get_extra_data_provider(model_name_or_path: str, processor=None): model_name_or_path = download_model(model_name_or_path) - config = AutoConfig.from_pretrained(model_name_or_path) - if "qwen2" in config.model_type: + try: + config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) + model_type = config.model_type + except ValueError as e: + # mca ckpt use mca_config.json as config file + import json + + from mcore_adapter.constants import MCA_CONFIG_NAME + + config_file = os.path.join(model_name_or_path, MCA_CONFIG_NAME) + model_type = None + if os.path.isfile(config_file): + with open(config_file, "r", encoding="utf-8") as reader: + text = reader.read() + config_values = json.loads(text) + model_type = config_values.get("hf_model_type") + else: + raise e + + # NOTE: + if isinstance(model_type, str) and (("qwen2" in model_type) or (model_type in 
("qwen3_vl", "qwen3_vl_moe"))): import types from transformers import BatchFeature # help define a object to accesss attr + def _call_get_rope_index(fn, input_ids: torch.LongTensor, **candidate_kwargs): + sig = inspect.signature(fn) + params = sig.parameters + accepts_kwargs = any(p.kind == inspect.Parameter.VAR_KEYWORD for p in params.values()) + if accepts_kwargs: + return fn(input_ids, **candidate_kwargs) + + filtered = {k: v for k, v in candidate_kwargs.items() if k in params} + return fn(input_ids, **filtered) + + spatial_merge_size = getattr(getattr(config, "vision_config", None), "spatial_merge_size", None) + if spatial_merge_size is None and processor is not None: + spatial_merge_size = getattr(getattr(processor, "image_processor", None), "merge_size", None) + if spatial_merge_size is None: + raise ValueError( + f"spatial_merge_size is required for model_type={model_type} get_rope_index, " + "but it was not found in config.vision_config nor processor.image_processor." + ) + vc = {"spatial_merge_size": spatial_merge_size} + tokens_per_second = getattr(getattr(config, "vision_config", None), "tokens_per_second", None) + if model_type == "qwen2_5_vl" and tokens_per_second is not None: + vc["tokens_per_second"] = tokens_per_second + + image_token_id = getattr(config, "image_token_id", None) + video_token_id = getattr(config, "video_token_id", None) + vision_start_token_id = getattr(config, "vision_start_token_id", None) + if processor is not None and hasattr(processor, "tokenizer"): + image_token_id = image_token_id or processor.tokenizer.convert_tokens_to_ids("<|image_pad|>") + video_token_id = video_token_id or processor.tokenizer.convert_tokens_to_ids("<|video_pad|>") + vision_start_token_id = vision_start_token_id or processor.tokenizer.convert_tokens_to_ids( + "<|vision_start|>" + ) + dummy_self = BatchFeature( { "config": BatchFeature( { - "vision_config": BatchFeature({"spatial_merge_size": processor.image_processor.merge_size}), - "image_token_id": processor.tokenizer.convert_tokens_to_ids("<|image_pad|>"), - "video_token_id": processor.tokenizer.convert_tokens_to_ids("<|video_pad|>"), - "vision_start_token_id": processor.tokenizer.convert_tokens_to_ids("<|vision_start|>"), + "vision_config": BatchFeature(vc), + "image_token_id": image_token_id, + "video_token_id": video_token_id, + "vision_start_token_id": vision_start_token_id, } ) } ) - if is_transformers_version_greater_than("4.52.0"): - from transformers.models.qwen2_vl import Qwen2VLModel - get_rope_index = types.MethodType(Qwen2VLModel.get_rope_index, dummy_self) + is_tf_ge_4_52 = is_transformers_version_greater_than("4.52.0") + if model_type == "qwen2_5_vl": + if is_tf_ge_4_52: + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLModel + + get_rope_index = types.MethodType(Qwen2_5_VLModel.get_rope_index, dummy_self) + else: + from transformers.models.qwen2_5_vl.modeling_qwen2_5_vl import Qwen2_5_VLForConditionalGeneration + + get_rope_index = types.MethodType(Qwen2_5_VLForConditionalGeneration.get_rope_index, dummy_self) + elif model_type in ("qwen3_vl", "qwen3_vl_moe"): + from transformers.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLModel + + get_rope_index = types.MethodType(Qwen3VLModel.get_rope_index, dummy_self) else: - from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration + if is_tf_ge_4_52: + from transformers.models.qwen2_vl import Qwen2VLModel - get_rope_index = types.MethodType(Qwen2VLForConditionalGeneration.get_rope_index, dummy_self) + get_rope_index = 
types.MethodType(Qwen2VLModel.get_rope_index, dummy_self) + else: + from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration + + get_rope_index = types.MethodType(Qwen2VLForConditionalGeneration.get_rope_index, dummy_self) def extra_data_provider( input_ids: torch.LongTensor, image_grid_thw: Optional[torch.LongTensor] = None, video_grid_thw: Optional[torch.LongTensor] = None, attention_mask: Optional[torch.Tensor] = None, + second_per_grid_ts: Optional[torch.Tensor] = None, ): - rope_index = get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)[0] - # (3, bsz, seqlen) -> (bsz, 3, seqlen) to put it into DataProto, - # transpose it batck to (3, bsz, seqlen) before forward for model + # Keep kwargs to be resilient to HF signature changes between versions/models. + out = _call_get_rope_index( + get_rope_index, + input_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + second_per_grid_ts=second_per_grid_ts, + attention_mask=attention_mask, + ) + rope_index = out[0] + # PumpkinComment: + # HF Qwen-VL "mrope" position_ids are expected to be 4-channel in newer transformers: + # [text_pos_ids, mrope_t, mrope_h, mrope_w] + # while some HF get_rope_index implementations return only the 3D vision part (3, bsz, seqlen). + # + # I normalize here so downstream strategies don't need model-specific hacks. + # Note: transformers < 4.54 only accepts vision position ids in some Qwen-VL variants. + if is_transformers_version_greater_than("4.53.3") and rope_index.dim() == 3 and rope_index.size(0) == 3: + bsz, seqlen = input_ids.shape + if attention_mask is not None: + text_pos_full = attention_mask.long().cumsum(-1) - 1 + text_pos_full = torch.clamp(text_pos_full, min=0).to( + dtype=rope_index.dtype, device=rope_index.device + ) + text_pos_full = text_pos_full.unsqueeze(0) # (1, bsz, seqlen) + else: + text_pos_full = ( + torch.arange(seqlen, dtype=rope_index.dtype, device=rope_index.device) + .view(1, 1, -1) + .expand(1, bsz, -1) + ) + rope_index = torch.cat([text_pos_full, rope_index], dim=0) # (4, bsz, seqlen) + + # (C, bsz, seqlen) -> (bsz, C, seqlen) to put it into DataProto, + # transpose it back to (C, bsz, seqlen) before forward for model. rope_index = rope_index.transpose(0, 1) return {"position_ids": rope_index} return extra_data_provider - return None \ No newline at end of file + + def default_extra_data_provider( + input_ids: torch.LongTensor, + attention_mask: Optional[torch.Tensor] = None, + ): + bsz, seqlen = input_ids.shape + position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) + position_ids = position_ids.unsqueeze(0).expand(bsz, -1) + if attention_mask is not None: + position_ids = position_ids.masked_fill(attention_mask == 0, 0) + return {"position_ids": position_ids} + + return default_extra_data_provider + + +def find_vlm_text_decoder(model: nn.Module) -> Optional[nn.Module]: + """ + Best-effort extractor for the text decoder stack of common VLM wrappers. + """ + # Unwrap PEFT if present. + base = getattr(model, "get_base_model", None) + if callable(base): + model = base() + + # Common attribute patterns across HF VLMs. + for path in ( + ("language_model",), + ("model", "language_model"), + ("model", "text_model"), + ("text_model",), + ("model",), + ): + cur: Any = model + ok = True + for p in path: + if not hasattr(cur, p): + ok = False + break + cur = getattr(cur, p) + if ok and isinstance(cur, nn.Module): + # Heuristic: the decoder usually has an embedding or layers attr. 
+ return cur + return None + + +def get_vl_model_vision_tower_blocks(vl_model_instance): + """ + Util to extract Vision Tower from a VL model instance + + Reference: https://github.com/volcengine/verl/blob/main/verl/workers/fsdp_workers.py#L128-L138 + """ + if hasattr(vl_model_instance, "model") and hasattr(vl_model_instance.model, "visual"): + # transformers >= 4.52.0 + return vl_model_instance.model.visual.blocks + elif hasattr(vl_model_instance, "visual"): + # transformers < 4.52.0 + return vl_model_instance.visual.blocks + return None + + +def _is_moe_config(cfg) -> bool: + if cfg is None: + return False + # Heuristic: cover common HF config fields for MoE models. + moe_keys = ( + "num_experts", + "n_experts", + "moe_num_experts", + "num_local_experts", + "num_experts_per_tok", + "router_aux_loss_coef", + "output_router_logits", + "moe_layer_freq", + ) + return any(getattr(cfg, k, None) not in (None, 0, False) for k in moe_keys) diff --git a/roll/pipeline/agentic/agentic_actor_pg_worker.py b/roll/pipeline/agentic/agentic_actor_pg_worker.py new file mode 100644 index 000000000..28c84fcc4 --- /dev/null +++ b/roll/pipeline/agentic/agentic_actor_pg_worker.py @@ -0,0 +1,588 @@ +import numpy as np +import torch + +from roll.distributed.scheduler.protocol import DataProto +from roll.pipeline.base_worker import ActorWorker as BaseActorWorker +from roll.utils.functionals import masked_mean, agg_loss, compute_approx_kl +from roll.pipeline.agentic.utils import compute_segment_masked_mean +from roll.pipeline.agentic.agentic_pipeline import get_episode_scores +from roll.utils.train_infer_corrections import compute_train_infer_correction +from roll.platforms import current_platform + + +class ActorWorker(BaseActorWorker): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + # 缓存PG变体的配置参数 + self._pg_config_cache = {} + self._pg_variant_logged = True + self._topr_sample_logged = False + self._cispo_config_logged = False + self._kimi15_config_logged = False + + def _get_or_cache_config(self, key, default_value): + """获取或缓存配置值""" + if key not in self._pg_config_cache: + self._pg_config_cache[key] = getattr(self.pipeline_config.actor_train, key, default_value) + return self._pg_config_cache[key] + + def loss_func(self, data: DataProto, output_tensor: torch.Tensor): + """ + loss func接口定义: + data: DataProto, 由train_step透传 + output_tensor: torch.Tensor, model.forward()的输出Tensor + """ + response_mask = data.batch["response_mask"][:, 1:].long() + ref_log_probs = data.batch["ref_log_probs"] + advantages = data.batch["advantages"] + + batch_num_tokens = data.meta_info['batch_num_tokens'] + global_valid_samples = data.meta_info['global_valid_samples'] + + log_probs = self.strategy.op_compute_log_probs( + logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] + ) + old_log_probs = self.get_old_log_probs_with_cache(data, log_probs) + infer_log_probs = data.batch.get("infer_logprobs", old_log_probs) + infer_log_probs = infer_log_probs if len(infer_log_probs) > 0 else old_log_probs + + train_infer_metric = {} + if not self.pipeline_config.enable_old_logprobs_recompute: + train_infer_is_weight, filter_mask, train_infer_metric = compute_train_infer_correction( + cfg=self.pipeline_config.train_infer_correction, + response_mask=response_mask, + old_log_probs=old_log_probs, + infer_log_probs=infer_log_probs, + global_valid_samples=global_valid_samples['response_mask'], + global_valid_tokens=batch_num_tokens['response_mask'], + ) + + # Apply filter mask 
to response_mask + response_mask = response_mask.long() * filter_mask.long() + else: + train_infer_is_weight = data.batch['train_infer_is_weight'] + + if self.pipeline_config.ratio_type == "segment": + # 计算序列级别的 ratio:对每段连续的1分别计算 masked_mean,不连续的段不相乘 + log_ratio = log_probs - old_log_probs + masked_log_ratio = compute_segment_masked_mean(log_ratio, response_mask) + ratio = masked_log_ratio.exp() + else: + ratio = (log_probs - old_log_probs).exp() + + pg_variant = self._get_or_cache_config("pg_variant", "vanilla") + self._cached_metrics = { + "pg_variant": pg_variant, + "ratio": ratio, + "response_mask": response_mask, + } + + if pg_variant == "vanilla": # Basic Policy Gradient + pg_loss = self._compute_vanilla_pg_loss(ratio, log_probs, advantages) + elif pg_variant == "ppo": # Proximal Policy Optimization + pg_loss = self._compute_ppo_loss(ratio, advantages, response_mask, batch_num_tokens=batch_num_tokens, + global_valid_samples=global_valid_samples) + elif pg_variant == "tis": # Truncated Importance Sampling + pg_loss = self._compute_tis_loss(ratio, log_probs, old_log_probs, response_mask, advantages, data, + batch_num_tokens=batch_num_tokens, global_valid_samples=global_valid_samples) + elif pg_variant == "topr": # Tapered off-policy REINFORCE + pg_loss = self._compute_topr_loss(ratio, log_probs, old_log_probs, advantages, data) + elif pg_variant == "cispo": # Clipped Importance Sampling Policy Optimization Minimax-M1 + pg_loss = self._compute_cispo_loss(ratio, log_probs, advantages) + elif pg_variant == "kimi15": # Kimi15 + pg_loss = self._compute_kimi15_loss(ratio, log_probs, old_log_probs, advantages) + else: + raise ValueError(f"Unsupported pg_variant: {pg_variant}") + + if self.pipeline_config.train_infer_correction.is_weight.enabled: + pg_loss = pg_loss * train_infer_is_weight + + pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask']) + # 缓存损失相关指标 + self._cached_metrics.update({"original_pg_loss": pg_loss}) + + kl_loss = compute_approx_kl( + log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=response_mask, kl_penalty="k3" + ) + kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask']) + + approxkl = compute_approx_kl( + log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="mse" + ) + policykl = compute_approx_kl( + log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="kl" + ) + + if self.pipeline_config.use_kl_loss: + total_loss = pg_loss + kl_loss * self.pipeline_config.kl_loss_coef + else: + total_loss = pg_loss + if self.pipeline_config.entropy_loss_coef > 0: + entropy = self.strategy.op_compute_entropy( + logits=output_tensor, attention_mask=data.batch["response_mask"] + ) + entropy_loss = agg_loss( + loss_mat=entropy, + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ) + total_loss = total_loss - entropy_loss * self.pipeline_config.entropy_loss_coef + + self._cached_metrics.update( + { + "kl_loss": kl_loss, + "approxkl": approxkl, + "policykl": policykl, + } + ) + + self._cached_metrics["total_loss"] = total_loss + + 
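A toy sketch (values are illustrative, not taken from the patch) of the clipped surrogate that the "ppo" branch dispatched above computes in _compute_ppo_loss, including the dual-clip term that is applied only where advantages are negative:

import torch

ratio = torch.tensor([0.5, 1.0, 1.6])      # pi / pi_old per token
adv   = torch.tensor([1.0, -1.0, -1.0])    # advantages per token
pg_clip = pg_clip_low = pg_clip_high = 0.2

surr1 = ratio * adv
surr2 = ratio.clamp(1 - pg_clip_low, 1 + pg_clip_high) * adv
loss  = -torch.min(surr1, surr2)
# dual clip: for adv < 0, cap the loss at -(1 + 2 * pg_clip) * adv
dual_clip_loss = -torch.max(-loss, (1 + pg_clip * 2) * adv)
loss = torch.where(adv < 0, dual_clip_loss, loss)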
# 使用缓存的指标 + pg_metrics = self._get_pg_metrics(data, batch_num_tokens=batch_num_tokens, global_valid_samples=global_valid_samples,) + pg_metrics.update(train_infer_metric) + return total_loss, pg_metrics + + def _compute_ppo_loss(self, ratio: torch.Tensor, advantages: torch.Tensor, response_mask: torch.Tensor, + batch_num_tokens: dict, global_valid_samples: dict): + """ + 计算PPO损失 + """ + pg_clip = self.pipeline_config.pg_clip + pg_clip_low = ( + self.pipeline_config.pg_clip_low + if self.pipeline_config.use_pg_clip_range + else self.pipeline_config.pg_clip + ) + pg_clip_high = ( + self.pipeline_config.pg_clip_high + if self.pipeline_config.use_pg_clip_range + else self.pipeline_config.pg_clip + ) + surr1 = ratio * advantages + surr2 = ratio.clamp(1 - pg_clip_low, 1 + pg_clip_high) * advantages + loss = -torch.min(surr1, surr2) + if self.pipeline_config.dual_clip_loss: + dual_clip_loss = -torch.max(-loss, (1 + pg_clip * 2) * advantages) + loss = torch.where(advantages < 0, dual_clip_loss, loss) + + # 缓存PPO相关指标 + clipped_low = (ratio < 1 - pg_clip_low).float() + clipped_high = (ratio > 1 + pg_clip_high).float() + clipped = (clipped_low + clipped_high).float() + + self._cached_metrics.update( + { + "ppo_ratio_high_clipfrac": agg_loss(loss_mat=clipped_high, loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "ppo_ratio_low_clipfrac": agg_loss(loss_mat=clipped_low, loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "ppo_ratio_clipfrac": agg_loss(loss_mat=clipped, loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "clipfrac": agg_loss( + loss_mat=torch.lt(surr2, surr1).float(), + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ) + .detach() + .item(), + } + ) + + return loss + + def _compute_vanilla_pg_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, advantages: torch.Tensor): + """ + 计算原始Policy Gradient损失 + + Args: + ratio: 重要性采样比率 π(a|s) / π_old(a|s) + advantages: 优势函数值 + + Returns: + pg_loss: Policy Gradient损失 + """ + + return -log_probs * advantages + + def _compute_tis_loss( + self, + ratio: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + response_mask: torch.Tensor, + advantages: torch.Tensor, + data: DataProto, + batch_num_tokens: dict, + global_valid_samples: dict + ): + """ + 计算Truncated Importance Sampling (TIS) 损失 + 根据论文: Truncated Importance Sampling for Value-based Reinforcement Learning + TIS将重要性采样比率截断在[0, 1]范围内 + """ + # 缓存TIS配置 + tis_lower_bound = self._get_or_cache_config("tis_lower_bound", 0.0) + tis_upper_bound = self._get_or_cache_config("tis_upper_bound", 1.0) + + # 截断重要性采样比率 + clipped_ratio = torch.clamp(ratio, min=tis_lower_bound, max=tis_upper_bound) + + TIS_loss = -clipped_ratio.detach() * advantages * log_probs + + # 缓存TIS相关指标 + lower_clipped = (ratio < tis_lower_bound).float() + upper_clipped = (ratio > tis_upper_bound).float() + total_clipped = (lower_clipped + upper_clipped).float() + + self._cached_metrics.update( + { + "tis_lower_bound": tis_lower_bound, + "tis_upper_bound": tis_upper_bound, + "tis_lower_clipfrac": agg_loss(loss_mat=lower_clipped, loss_mask=response_mask, loss_agg_mode='token-mean', + 
batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "tis_upper_clipfrac": agg_loss(loss_mat=upper_clipped, loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "tis_total_clipfrac": agg_loss(loss_mat=total_clipped, loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "tis_clipped_ratio": clipped_ratio.detach(), + } + ) + + return TIS_loss + + def _compute_topr_loss( + self, + ratio: torch.Tensor, + log_probs: torch.Tensor, + old_log_probs: torch.Tensor, + advantages: torch.Tensor, + data: DataProto, + ): + """ + 计算TOPR (Tapered off-policy REINFORCE) 损失. https://arxiv.org/abs/2503.14286 + + 根据论文公式(8): + ∇J_TOPR(π) = Σ_{τ∈T^+} μ(τ)R(τ)∇log π(τ) + Σ_{τ∈T^-} μ(τ)[π(τ)/μ(τ)]_0^1 R(τ)∇log π(τ) + + - 正样本(T^+): SFT更新, 直接对log π(τ)求导, 不使用importance sampling + - 负样本(T^-): TIS更新, 使用clipped importance sampling ratio [0,1] + + Args: + ratio: 重要性采样比率 π(a|s) / π_old(a|s) [batch_size, seq_len] + log_probs: 当前策略的log概率 [batch_size, seq_len] + old_log_probs: 旧策略的log概率 [batch_size, seq_len] + advantages: 优势函数值 [batch_size, seq_len] + data: 数据,包含奖励/分数信息 + + Returns: + topr_loss: TOPR损失 [batch_size, seq_len] + """ + # 缓存TOPR配置 + positive_weight = self._get_or_cache_config("topr_positive_weight", 1.0) + negative_weight = self._get_or_cache_config("topr_negative_weight", 1.0) + + # scores = data.batch['scores']dim=@).to(current_platform.device_type) + scores = get_episode_scores(data).to(current_platform.device_type) + positive_mask = (scores > 0).float() + negative_mask = (scores <= 0).float() + + if not self._topr_sample_logged: + total_samples = len(scores) + positive_count = positive_mask.sum().item() + negative_count = negative_mask.sum().item() + self.logger.info( + f"TOPR样本分布 - 总样本: {total_samples}, 正样本: {positive_count} ({positive_count/total_samples*100:.1f}%), 负样本: {negative_count} ({negative_count/total_samples*100:.1f}%)" + ) + self.logger.info( + f"TOPR奖励统计 - 平均: {scores.mean().item():.4f}, 标准差: {scores.std().item():.4f}, 最大: {scores.max().item():.4f}, 最小: {scores.min().item():.4f}" + ) + self.logger.info(f"TOPR权重配置 - 正样本权重: {positive_weight}, 负样本权重: {negative_weight}") + self._topr_sample_logged = True + + # 计算损失组件 + positive_token_mask = positive_mask.unsqueeze(-1).expand_as(log_probs) + negative_token_mask = negative_mask.unsqueeze(-1).expand_as(log_probs) + + positive_loss = -advantages * log_probs * positive_token_mask + + # 负样本: TIS更新,使用clipped importance sampling ratio + # 梯度是: -[π(τ)/μ(τ)]_0^1 * R(τ) * ∇log π(τ) + clipped_ratio = torch.clamp(ratio, min=0.0, max=1.0).detach() + negative_loss = -clipped_ratio * advantages * log_probs * negative_token_mask + + weighted_positive_loss = positive_weight * positive_loss + weighted_negative_loss = negative_weight * negative_loss + + topr_loss = weighted_positive_loss + weighted_negative_loss + + # 缓存TOPR相关指标 + negative_lower_clipped = ((ratio < 0.0) & (negative_token_mask > 0)).float() + negative_upper_clipped = ((ratio > 1.0) & (negative_token_mask > 0)).float() + negative_total_clipped = negative_lower_clipped + negative_upper_clipped + self._cached_metrics.update( + { + "topr_positive_loss": positive_loss, + "topr_negative_loss": negative_loss, + "topr_weighted_positive_loss": weighted_positive_loss, + "topr_weighted_negative_loss": weighted_negative_loss, + "topr_positive_weight": positive_weight, + "topr_negative_weight": negative_weight, + "topr_positive_samples": 
positive_mask.sum().detach().item(), + "topr_negative_samples": negative_mask.sum().detach().item(), + "topr_positive_ratio": (positive_mask.sum() / (positive_mask.size(0) + 1e-8)).detach().item(), + "topr_negative_ratio": (negative_mask.sum() / (negative_mask.size(0) + 1e-8)).detach().item(), + "topr_negative_lower_clipfrac": negative_lower_clipped.mean().detach().item(), + "topr_negative_upper_clipfrac": negative_upper_clipped.mean().detach().item(), + "topr_negative_total_clipfrac": negative_total_clipped.mean().detach().item(), + "topr_scores_mean": scores.mean().detach().item(), + "topr_scores_std": scores.std().detach().item(), + } + ) + + return topr_loss + + def _compute_cispo_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, advantages: torch.Tensor): + """ + 计算CISPO (Clipped Importance Sampling Policy Optimization) 损失 + + 根据论文: https://arxiv.org/abs/2503.14286 + CISPO使用截断的重要性采样权重, 同时使用stop-gradient操作来稳定训练 + + 公式: J_CISPO(θ) = E[sg(r̂_t(θ)) * Â_t * log π_θ(a_t|s_t)] + 其中: r̂_t(θ) = clip(r_t(θ), 1-ε_low^IS, 1+ε_high^IS) + + Args: + ratio: 重要性采样比率 π(a|s) / π_old(a|s) [batch_size, seq_len] + log_probs: 当前策略的log概率 [batch_size, seq_len] + advantages: 优势函数值 [batch_size, seq_len] + + Returns: + cispo_loss: CISPO损失 [batch_size, seq_len] + """ + # 缓存CISPO配置 + epsilon_low = self._get_or_cache_config("cispo_epsilon_low", 0.1) + epsilon_high = self._get_or_cache_config("cispo_epsilon_high", 0.1) + use_unified_mask = self._get_or_cache_config("cispo_use_unified_mask", False) + + clip_lower = 1.0 - epsilon_low + clip_upper = 1.0 + epsilon_high + + if not self._cispo_config_logged: + self.logger.info(f"CISPO配置 - epsilon_low: {epsilon_low}, epsilon_high: {epsilon_high}") + self.logger.info(f"CISPO截断范围: [{clip_lower:.3f}, {clip_upper:.3f}]") + self.logger.info(f"CISPO使用统一mask: {use_unified_mask}") + self._cispo_config_logged = True + + clipped_ratio = torch.clamp(ratio, min=clip_lower, max=clip_upper) + + # 缓存CISPO相关指标 + lower_clipped = (ratio < clip_lower).float() + upper_clipped = (ratio > clip_upper).float() + total_clipped = (lower_clipped + upper_clipped).float() + + if use_unified_mask: + # 使用统一mask公式 (论文公式7). 
实际上应该和PPO一致了 + # M_t = 0 if (A_t > 0 and r_t > 1+ε_high) or (A_t < 0 and r_t < 1-ε_low), else 1 + positive_advantages = advantages > 0 + negative_advantages = advantages < 0 + + mask_positive = positive_advantages & (ratio > clip_upper) + mask_negative = negative_advantages & (ratio < clip_lower) + token_mask = ~(mask_positive | mask_negative) + + cispo_loss = -clipped_ratio.detach() * advantages * log_probs * token_mask.float() + else: + cispo_loss = -clipped_ratio.detach() * advantages * log_probs + + cispo_metrics = { + "cispo_epsilon_low": epsilon_low, + "cispo_epsilon_high": epsilon_high, + "cispo_clip_lower": clip_lower, + "cispo_clip_upper": clip_upper, + "cispo_use_unified_mask": float(use_unified_mask), + "cispo_lower_clipfrac": lower_clipped.mean().detach().item(), + "cispo_upper_clipfrac": upper_clipped.mean().detach().item(), + "cispo_total_clipfrac": total_clipped.mean().detach().item(), + "cispo_clipped_ratio": clipped_ratio.detach(), + } + if use_unified_mask: + cispo_metrics.update( + { + "cispo_masked_positive_tokens": mask_positive.float().mean().detach().item(), + "cispo_masked_negative_tokens": mask_negative.float().mean().detach().item(), + "cispo_kept_tokens": token_mask.float().mean().detach().item(), + } + ) + + self._cached_metrics.update(cispo_metrics) + return cispo_loss + + def _compute_kimi15_loss( + self, ratio: torch.Tensor, log_probs: torch.Tensor, old_log_probs: torch.Tensor, advantages: torch.Tensor + ): + """ + 计算Kimi15损失 https://arxiv.org/pdf/2501.12599 + + 根据论文公式(3): + 1/k Σ (∇_θ log π_θ(y_j, z_j|x)(r(x, y_j, y*) - r̄) - τ/2 ∇_θ (log π_θ(y_j, z_j|x)/π_θ_i(y_j, z_j|x))^2) + + 这相当于最小化损失函数的负值: + L = -[(r - r̄) * log π_θ - τ/2 * (log π_θ/π_θ_i)^2] + """ + # 缓存Kimi15配置 + tau = self._get_or_cache_config("kimi15_tau", 0.1) + + if not self._kimi15_config_logged: + self.logger.info(f"Kimi15配置 - tau (正则化参数): {tau}") + self._kimi15_config_logged = True + + # 计算并缓存指标 + log_ratio = torch.log(ratio + 1e-8) + policy_grad_magnitude = (advantages * log_ratio).abs().mean().item() + kl_reg_magnitude = (tau * log_ratio.pow(2) * 0.5).mean().item() + + kimi15_loss = -advantages * log_probs + tau * 0.5 * (log_probs - old_log_probs).pow(2) + + self._cached_metrics.update( + { + "kimi15_tau": tau, + "kimi15_log_ratio_mean": log_ratio.mean().item(), + "kimi15_log_ratio_std": log_ratio.std().item(), + "kimi15_log_ratio_abs_mean": log_ratio.abs().mean().item(), + "kimi15_policy_grad_magnitude": policy_grad_magnitude, + "kimi15_kl_reg_magnitude": kl_reg_magnitude, + "kimi15_reg_ratio": kl_reg_magnitude / (policy_grad_magnitude + 1e-8), + } + ) + + return kimi15_loss + + def _get_pg_metrics(self, data: DataProto, batch_num_tokens: dict, global_valid_samples: dict,): + """ + 获取Policy Gradient相关的指标,使用缓存的值避免重复计算 + """ + # 从缓存中获取基础值 + cached = self._cached_metrics + ratio = cached["ratio"] + response_mask = cached["response_mask"] + + scores = get_episode_scores(data).to(current_platform.device_type) + positive_mask = (scores > 0).float() + negative_mask = (scores <= 0).float() + positive_token_mask = positive_mask.unsqueeze(-1).expand_as(response_mask) * response_mask + negative_token_mask = negative_mask.unsqueeze(-1).expand_as(response_mask) * response_mask + + # 构建基础指标 + base_metrics = { + "actor/ratio_mean@sum": agg_loss(loss_mat=ratio, loss_mask=response_mask, loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/ratio_max@max": torch.max(ratio * response_mask).detach().item(), + "actor/ratio_min@min": 
torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), + "actor/pg_loss@sum": cached["original_pg_loss"].detach().item(), + "actor/kl_loss@sum": cached["kl_loss"].detach().item(), + "actor/total_loss@sum": cached["total_loss"].detach().item(), + "actor/approxkl@sum": agg_loss( + loss_mat=cached["approxkl"], loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask'] + ).detach().item(), + "actor/policykl@sum": agg_loss( + loss_mat=cached["policykl"], loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask'] + ).detach().item(), + } + + # 根据PG变体添加特定指标 + pg_variant = cached["pg_variant"] + + if pg_variant == "ppo": + ppo_metrics = { + "actor/ppo_ratio_high_clipfrac@sum": cached["ppo_ratio_high_clipfrac"], + "actor/ppo_ratio_low_clipfrac@sum": cached["ppo_ratio_low_clipfrac"], + "actor/ppo_ratio_clipfrac@sum": cached["ppo_ratio_clipfrac"], + } + base_metrics.update(ppo_metrics) + + elif pg_variant == "tis": + tis_metrics = { + "actor/tis_lower_clipfrac@sum": cached["tis_lower_clipfrac"], + "actor/tis_upper_clipfrac@sum": cached["tis_upper_clipfrac"], + "actor/tis_total_clipfrac@sum": cached["tis_total_clipfrac"], + "actor/tis_clipped_ratio_mean@sum": agg_loss( + loss_mat=cached["tis_clipped_ratio"], loss_mask=response_mask, loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['response_mask']).detach().item(), + "actor/tis_lower_bound": cached["tis_lower_bound"], + "actor/tis_upper_bound": cached["tis_upper_bound"], + } + base_metrics.update(tis_metrics) + + elif pg_variant == "topr": + # 计算TOPR损失组件的聚合指标 + topr_loss_metrics = { + "actor/topr_positive_loss": agg_loss( + loss_mat=cached["topr_positive_loss"], + loss_mask=positive_token_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + ).detach().item(), + "actor/topr_negative_loss": agg_loss( + loss_mat=cached["topr_negative_loss"], + loss_mask=negative_token_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + ).detach().item(), + "actor/topr_weighted_positive_loss": agg_loss( + loss_mat=cached["topr_weighted_positive_loss"], + loss_mask=positive_token_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + ).detach().item(), + "actor/topr_weighted_negative_loss": agg_loss( + loss_mat=cached["topr_weighted_negative_loss"], + loss_mask=negative_token_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + ).detach().item(), + } + + topr_metrics = { + "actor/topr_positive_samples@sum": cached["topr_positive_samples"], + "actor/topr_negative_samples@sum": cached["topr_negative_samples"], + "actor/topr_positive_ratio": cached["topr_positive_ratio"], + "actor/topr_negative_ratio": cached["topr_negative_ratio"], + "actor/topr_negative_lower_clipfrac": cached["topr_negative_lower_clipfrac"], + "actor/topr_negative_upper_clipfrac": cached["topr_negative_upper_clipfrac"], + "actor/topr_negative_total_clipfrac": cached["topr_negative_total_clipfrac"], + "actor/topr_scores_mean": cached["topr_scores_mean"], + "actor/topr_scores_std": cached["topr_scores_std"], + "actor/topr_positive_weight": cached["topr_positive_weight"], + "actor/topr_negative_weight": cached["topr_negative_weight"], + **topr_loss_metrics, + } + base_metrics.update(topr_metrics) + + elif pg_variant == "cispo": + cispo_metrics = { + f"actor/cispo_{key}": 
value + for key, value in cached.items() + if key.startswith("cispo_") and key != "cispo_clipped_ratio" + } + + # 特殊处理需要计算的指标 + cispo_metrics["actor/cispo_clipped_ratio_mean@sum"] = agg_loss(loss_mat=cached["cispo_clipped_ratio"], + loss_mask=response_mask, + loss_agg_mode='seq-mean-token-mean', + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'])\ + .detach().item() + base_metrics.update(cispo_metrics) + + elif pg_variant == "kimi15": + kimi15_metrics = { + f"actor/kimi15_{key}": value for key, value in cached.items() if key.startswith("kimi15_") + } + base_metrics.update(kimi15_metrics) + + return base_metrics diff --git a/roll/pipeline/agentic/agentic_actor_worker.py b/roll/pipeline/agentic/agentic_actor_worker.py index 75510c675..fad7b3996 100644 --- a/roll/pipeline/agentic/agentic_actor_worker.py +++ b/roll/pipeline/agentic/agentic_actor_worker.py @@ -4,7 +4,8 @@ from roll.distributed.scheduler.protocol import DataProto from roll.pipeline.base_worker import ActorWorker as BaseActorWorker from roll.utils.functionals import masked_mean, agg_loss, compute_approx_kl - +from roll.pipeline.agentic.utils import compute_segment_masked_mean +from roll.utils.train_infer_corrections import compute_train_infer_correction class ActorWorker(BaseActorWorker): def loss_func(self, data: DataProto, output_tensor: torch.Tensor): @@ -17,6 +18,9 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): ref_log_probs = data.batch["ref_log_probs"] advantages = data.batch["advantages"] + batch_num_tokens = data.meta_info['batch_num_tokens'] + global_valid_samples = data.meta_info['global_valid_samples'] + log_probs = self.strategy.op_compute_log_probs( logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] ) @@ -24,16 +28,40 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): infer_log_probs = data.batch.get("infer_logprobs", old_log_probs) infer_log_probs = infer_log_probs if len(infer_log_probs) > 0 else old_log_probs + train_infer_metric = {} + if not self.pipeline_config.enable_old_logprobs_recompute: + train_infer_is_weight, filter_mask, train_infer_metric = compute_train_infer_correction( + cfg=self.pipeline_config.train_infer_correction, + response_mask=response_mask, + old_log_probs=old_log_probs, + infer_log_probs=infer_log_probs, + global_valid_samples=global_valid_samples['response_mask'], + global_valid_tokens=batch_num_tokens['response_mask'], + ) + + # Apply filter mask to response_mask + response_mask = response_mask.long() * filter_mask.long() + else: + train_infer_is_weight = data.batch['train_infer_is_weight'] + if self.pipeline_config.ratio_type == "segment": - raise NotImplemented(f"ratio_type: {self.pipeline_config.ratio_type} not implemented") + # 计算序列级别的 ratio:对每段连续的1分别计算 masked_mean,不连续的段不相乘 + log_ratio = log_probs - old_log_probs + masked_log_ratio = compute_segment_masked_mean(log_ratio, response_mask) + ratio = masked_log_ratio.exp() else: ratio = (log_probs - old_log_probs).exp() - - train_infer_ratio = (log_probs - infer_log_probs).exp() - train_infer_diff = log_probs.exp() - infer_log_probs.exp() - pg_clip_low = self.pipeline_config.pg_clip_low if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip - pg_clip_high = self.pipeline_config.pg_clip_high if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip + pg_clip_low = ( + self.pipeline_config.pg_clip_low + if self.pipeline_config.use_pg_clip_range + 
else self.pipeline_config.pg_clip + ) + pg_clip_high = ( + self.pipeline_config.pg_clip_high + if self.pipeline_config.use_pg_clip_range + else self.pipeline_config.pg_clip + ) surr1 = ratio * advantages surr2 = ratio.clamp(1 - pg_clip_low, 1 + pg_clip_high) * advantages pg_loss = -torch.min(surr1, surr2) @@ -41,11 +69,17 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): dual_clip_loss = -torch.max(-pg_loss, (1 + self.pipeline_config.pg_clip * 2) * advantages) pg_loss = torch.where(advantages < 0, dual_clip_loss, pg_loss) - pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode) + if self.pipeline_config.train_infer_correction.is_weight.enabled: + pg_loss = pg_loss * train_infer_is_weight - kl_loss = compute_approx_kl(log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=response_mask, - kl_penalty="k3") - kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode) + pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask']) + + kl_loss = compute_approx_kl( + log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=response_mask, kl_penalty="k3" + ) + kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask']) approxkl = compute_approx_kl( log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="mse" @@ -62,37 +96,52 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): else: total_loss = pg_loss if self.pipeline_config.entropy_loss_coef > 0: - entropy = self.strategy.op_compute_entropy(logits=output_tensor, attention_mask=data.batch["response_mask"]) + entropy = self.strategy.op_compute_entropy( + logits=output_tensor, attention_mask=data.batch["response_mask"] + ) entropy_loss = agg_loss( loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], ) total_loss = total_loss - entropy_loss * self.pipeline_config.entropy_loss_coef - train_infer_prob_metric = { - "actor/train_infer_ratio_mean": masked_mean(train_infer_ratio, response_mask, dim=-1).mean().detach().item(), - "actor/train_infer_diff_mean": masked_mean(train_infer_diff, response_mask, dim=-1).mean().detach().item(), - } - pg_metrics = { - "actor/ppo_ratio_high_clipfrac": clipped_high.mean().detach().item(), - "actor/ppo_ratio_low_clipfrac": clipped_low.mean().detach().item(), - "actor/ppo_ratio_clipfrac": clipped.mean().detach().item(), - "actor/ratio_mean": masked_mean(ratio, response_mask, dim=-1).mean().detach().item(), - "actor/ratio_max": torch.max(ratio * response_mask).detach().item(), - "actor/ratio_min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), - "actor/clipfrac": agg_loss(loss_mat=torch.lt(surr2, surr1).float(), loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/pg_loss": pg_loss.detach().item(), - "actor/kl_loss": kl_loss.detach().item(), - "actor/total_loss": total_loss.detach().item(), - "actor/approxkl": agg_loss(loss_mat=approxkl, loss_mask=response_mask, - 
loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/policykl": agg_loss(loss_mat=policykl, loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - **train_infer_prob_metric + "actor/ppo_ratio_high_clipfrac@sum": agg_loss(loss_mat=clipped_high, + loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "actor/ppo_ratio_low_clipfrac@sum": agg_loss(loss_mat=clipped_low, + loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "actor/ppo_ratio_clipfrac@sum": agg_loss(loss_mat=clipped, + loss_mask=response_mask, loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['response_mask'],).detach().item(), + "actor/ratio_mean@sum": agg_loss(loss_mat=ratio, + loss_mask=response_mask, loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/ratio_max@max": torch.max(ratio * response_mask).detach().item(), + "actor/ratio_min@min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), + "actor/clipfrac@sum": agg_loss( + loss_mat=torch.lt(surr2, surr1).float(), + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ).detach().item(), + "actor/pg_loss@sum": pg_loss.detach().item(), + "actor/kl_loss@sum": kl_loss.detach().item(), + "actor/total_loss@sum": total_loss.detach().item(), + "actor/approxkl@sum": agg_loss( + loss_mat=approxkl, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask'] + ).detach().item(), + "actor/policykl@sum": agg_loss( + loss_mat=policykl, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], global_valid_samples=global_valid_samples['response_mask'] + ).detach().item(), + **train_infer_metric, } - - return total_loss, pg_metrics + return total_loss, pg_metrics diff --git a/roll/pipeline/agentic/agentic_config.py b/roll/pipeline/agentic/agentic_config.py index fc2e53cbb..def3246cd 100644 --- a/roll/pipeline/agentic/agentic_config.py +++ b/roll/pipeline/agentic/agentic_config.py @@ -41,7 +41,7 @@ def _resolve_reward_norm_defaults(method: str, grouping: str) -> Dict[str, Optio @dataclass class RewardNormalizationConfig: - grouping: str = field(default="state", metadata={"help": "state / batch / inductive"}) + grouping: str = field(default="batch", metadata={"help": "state / batch / inductive / global"}) method: str = field( default="identity", metadata={ @@ -61,6 +61,16 @@ class RewardNormalizationConfig: "help": "Std type for reward normalization: 'batch' (normalize across batch), 'group' (normalize within groups), None (without dividing by std)" }, ) + global_scale_factor: Optional[float] = field( + default=None, + metadata={ + "help": "Scale factor for global reward transformation. Applied after mapping if both are specified." + }, + ) + global_shift_value: Optional[float] = field( + default=None, + metadata={"help": "Shift value for global reward transformation. 
Applied after scaling if specified."}, + ) def __post_init__(self): @@ -160,6 +170,18 @@ def final_group_size(self): return self.group_size + self.group_size_redundancy +@dataclass +class RewardConfig(WorkerConfig): + llm_proxy: LLMProxyConfig = field(default_factory=LLMProxyConfig, metadata={"help": "llm proxy config."}) + +@dataclass +class EnvMonitorConfig: + """Configuration for environment activity monitoring.""" + enable: bool = field(default=True, metadata={"help": "Enable env monitoring"}) + monitor_interval: int = field(default=30, metadata={"help": "Monitor interval in seconds"}) + hung_timeout: int = field(default=3600, metadata={"help": "Hung timeout threshold in seconds (default: 1 hour)"}) + + @dataclass class AgenticConfig(PPOConfig): # agentic related @@ -167,9 +189,16 @@ class AgenticConfig(PPOConfig): train_env_manager: EnvManagerConfig = field(default_factory=EnvManagerConfig) val_env_manager: EnvManagerConfig = field(default_factory=EnvManagerConfig) render_save_dir: str = field(default=None, metadata={"help": "Directory to save rendered frames."}) + reward: RewardConfig = field(default=None, metadata={"help": "Configuration for reward inference."}) reward_normalization: RewardNormalizationConfig = field( default_factory=RewardNormalizationConfig, metadata={"help": "Reward normalization configuration."} ) + env_monitor: EnvMonitorConfig = field( + default_factory=EnvMonitorConfig, metadata={"help": "Environment monitoring configuration."} + ) + dirty_data_mask: bool = field(default=False, metadata={"help": "if dirty data mask is True, will mask dirty data"}) + open_feedback_turn: bool = field(default=False, metadata={"help": "open feedback turn"}) + use_token_reward: bool = field(default=False, metadata={"help": "use token reward"}) batch_adjust_mode: Literal["copy", "delete", "auto", "random_sample"] = field( default="copy", metadata={"help": "batch adjust mode: copy or delete"} @@ -178,20 +207,61 @@ class AgenticConfig(PPOConfig): step_reward_weight: float = field(default=1.0, metadata={"help": "Step reward weight, used in GiGPO."}) step_reward_gamma: float = field(default=0.95, metadata={"help": "Gamma parameter for step reward calculation"}) ratio_type: Literal["token", "segment"] = field(default="token", metadata={"help": "Ratio type: token or segment"}) + exp_mode: str = field( + default="train", + metadata={ + "help": "experiment mode: 'train' for training, 'eval_gt' for ground truth validation, 'eval_test' for unit test validation" + }, + ) + + partial_gpu_mode: bool = field( + default=True, + metadata={ + "help": "Enable partial GPU mode. When True, AgenticPipeline will validate and derive " + "partial_gpu_mode from device_mapping; when False, partial GPU logic is disabled." + }, + ) + + parse_tool_call_parameter_to_dict: bool = field(default=False, metadata={"help": "Parse tool call parameter to dict. for https://github.com/QwenLM/Qwen3-Coder/issues/444"}) def __post_init__(self): - self.actor_infer.generating_args.num_return_sequences = 1 + assert self.actor_infer.generating_args or self.train_env_manager.generating_args, "must have generating_args in env_manager or actor infer." 
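A hedged sketch of the ordering documented by the two new RewardNormalizationConfig fields above: global_scale_factor is applied first, then global_shift_value. The function name is illustrative; the real call site lives in the reward-normalization utilities and is not part of this hunk.

from typing import Optional

def apply_global_reward_transform(reward: float, scale: Optional[float], shift: Optional[float]) -> float:
    if scale is not None:
        reward = reward * scale   # scale first
    if shift is not None:
        reward = reward + shift   # then shift
    return reward

# e.g. apply_global_reward_transform(5.0, scale=0.1, shift=-1.0) -> -0.5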
+ + # If actor_infer.generating_args exists, set it for both env managers + if self.actor_infer.generating_args: + self.train_env_manager.generating_args = self.actor_infer.generating_args + self.val_env_manager.generating_args = self.actor_infer.generating_args + # If train_env_manager.generating_args exists, set it for actor_infer + elif self.train_env_manager.generating_args: + self.actor_infer.generating_args = self.train_env_manager.generating_args + + # Ensure num_return_sequences is 1 for all generating_args + if self.actor_infer.generating_args: + self.actor_infer.generating_args.num_return_sequences = 1 + if self.train_env_manager.generating_args: + self.train_env_manager.generating_args.num_return_sequences = 1 + if self.val_env_manager.generating_args: + self.val_env_manager.generating_args.num_return_sequences = 1 + + logger.info(f"actor_infer.generating_args: {self.actor_infer.generating_args}\n" + f"train_env_manager.generating_args: {self.train_env_manager.generating_args}\n" + f"val_env_manager.generating_args: {self.val_env_manager.generating_args}") super().__post_init__() # default worker_cls if self.actor_train.worker_cls is None: self.actor_train.worker_cls = "roll.pipeline.agentic.agentic_actor_worker.ActorWorker" if self.actor_infer.worker_cls is None: - self.actor_infer.worker_cls = "roll.pipeline.base_worker.ActorWorker" + self.actor_infer.worker_cls = "roll.pipeline.base_worker.InferWorker" if self.reference.worker_cls is None: self.reference.worker_cls = "roll.pipeline.base_worker.ActorWorker" if self.critic.worker_cls is None: self.critic.worker_cls = "roll.pipeline.base_worker.CriticWorker" + if self.reward: + if self.reward.worker_cls is None: + self.reward.worker_cls = "roll.pipeline.base_worker.InferWorker" + if self.reward.name is None: + self.reward.name = "reward" self.train_env_manager.name = "train_env" self.val_env_manager.name = "val_env" @@ -205,9 +275,7 @@ def __post_init__(self): assert self.max_steps > 0 or self.max_steps == -1, "max_steps must be greater than 0 or -1" self.train_env_manager.model_args.model_name_or_path = self.pretrain - self.train_env_manager.generating_args = self.actor_infer.generating_args self.val_env_manager.model_args.model_name_or_path = self.pretrain - self.val_env_manager.generating_args = self.actor_infer.generating_args self.custom_envs = DictConfig(self.custom_envs) self.make_env_configs(self.train_env_manager) self.make_env_configs(self.val_env_manager) @@ -222,10 +290,30 @@ def __post_init__(self): logger.info(f"train_env_manager.max_traj_per_env: {self.train_env_manager.max_traj_per_env}") assert self.train_env_manager.max_traj_per_env >= traj_per_env, f"max_traj_per_env must be >= {traj_per_env}" + # Validate rollout_batch_size is compatible with group_size + # The scheduler collects trajectories in complete groups to maintain variance reduction properties + if self.rollout_batch_size > 0: # Skip validation if negative (unlimited batch) + assert self.rollout_batch_size % self.train_env_manager.group_size == 0, ( + f"rollout_batch_size ({self.rollout_batch_size}) must be a multiple of " + f"train_env_manager.group_size ({self.train_env_manager.group_size}). " + f"The scheduler collects trajectories in complete groups, so batch_size must be divisible by group_size. 
" + f"Suggested values: rollout_batch_size={self.rollout_batch_size} with group_size in {[i for i in [1, 2, 4, 8, 16] if self.rollout_batch_size % i == 0]}, " + f"or group_size={self.train_env_manager.group_size} with rollout_batch_size as a multiple of {self.train_env_manager.group_size}." + ) + val_env_num = self.val_env_manager.num_env_groups * self.val_env_manager.group_size if self.val_batch_size < 0: self.val_env_manager.max_traj_per_env = sys.maxsize else: + + # Validate val_batch_size is compatible with group_size (similar to rollout_batch_size validation) + assert self.val_batch_size % self.val_env_manager.group_size == 0, ( + f"val_batch_size ({self.val_batch_size}) must be a multiple of " + f"val_env_manager.group_size ({self.val_env_manager.group_size}). " + f"Suggested values: val_batch_size={self.val_batch_size} with group_size in {[i for i in [1, 2, 4, 8, 16] if self.val_batch_size % i == 0]}, " + f"or group_size={self.val_env_manager.group_size} with val_batch_size as a multiple of {self.val_env_manager.group_size}." + ) + assert ( self.val_batch_size % val_env_num == 0 ), f"val_batch_size {self.val_batch_size} must be divisible by val_env_num {val_env_num}, equal best" @@ -236,7 +324,19 @@ def __post_init__(self): logger.info(f"val_env_manager.max_traj_per_env: {self.val_env_manager.max_traj_per_env}") assert self.val_env_manager.max_traj_per_env >= traj_per_env, f"max_traj_per_env must be >= {traj_per_env}" - self.validate_worker_config() + if ( + hasattr(self, "actor_infer") + and isinstance(self.actor_infer, WorkerConfig) + and self.actor_infer.strategy_args is not None + ): + strategy_name = self.actor_infer.strategy_args.strategy_name + assert strategy_name in ["vllm", "sglang"] + max_concurrency = max( + self.train_env_manager.world_size * self.train_env_manager.max_env_num_per_worker + 1, + self.val_env_manager.world_size * self.val_env_manager.max_env_num_per_worker + 1, + ) + self.actor_infer.max_concurrency = max(self.actor_infer.max_concurrency, max_concurrency) + logger.info(f"Set max_concurrency of actor_infer to {self.actor_infer.max_concurrency}") def make_env_configs(self, env_manager_config: EnvManagerConfig): # construct env configs diff --git a/roll/pipeline/agentic/agentic_pipeline.py b/roll/pipeline/agentic/agentic_pipeline.py index 4e666e6fd..d3e812cd7 100644 --- a/roll/pipeline/agentic/agentic_pipeline.py +++ b/roll/pipeline/agentic/agentic_pipeline.py @@ -1,7 +1,7 @@ import json import os.path -import random import time +from concurrent.futures import ThreadPoolExecutor from typing import Any, Dict, List import numpy as np @@ -12,37 +12,52 @@ from ray.util.timer import _Timer from roll.datasets.global_dataset import GlobalDatasetManager -from roll.distributed.scheduler.rollout_scheduler import RolloutScheduler from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.protocol import DataProto +from roll.distributed.scheduler.generate_scheduler import RequestScheduler +from roll.distributed.scheduler.rollout_scheduler import RolloutScheduler from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.agentic.agentic_config import AgenticConfig, EnvManagerConfig -from roll.pipeline.agentic.utils import (dump_rollout_render, compute_discounted_returns, - compute_response_level_rewards, dump_rollout_trajectories, get_agentic_response_level_mask, agentic_compute_advantage) +from roll.pipeline.agentic.utils import ( + agentic_compute_advantage, + compute_discounted_returns, + 
compute_response_level_rewards, + dump_rollout_trajectories, + get_agentic_response_level_mask, +) from roll.pipeline.base_pipeline import BasePipeline from roll.utils.constants import RAY_NAMESPACE +from roll.utils.dynamic_batching import dynamic_batching_shard from roll.utils.functionals import ( - apply_kl_penalty, - compute_advantage, - reduce_metrics, - masked_mean, RunningMoments, - compute_clip_fraction, agg_loss, compute_token_reward, + masked_mean, + reduce_metrics, + batch_balance ) +from roll.utils.train_infer_corrections import apply_train_infer_correction_to_batch from roll.utils.kl_controller import get_kl_controller from roll.utils.logging import get_logger +from roll.utils.offload_states import OffloadStateType + logger = get_logger() +def is_lora_training(pipeline_config: AgenticConfig) -> bool: + return pipeline_config.actor_train.model_args.lora_target is not None + class AgenticPipeline(BasePipeline): def __init__(self, pipeline_config: AgenticConfig): super().__init__(pipeline_config) self.pipeline_config: AgenticConfig self.pipeline_config.set_max_steps(max_steps=self.pipeline_config.max_steps) + self.use_ref_model = self.pipeline_config.enable_reference and (not is_lora_training(self.pipeline_config)) + + # Derived configuration for partial GPU mode (auto-detected from device_mapping) + self.partial_gpu_mode: bool = False self.kl_ctrl = get_kl_controller( init_kl_coef=self.pipeline_config.init_kl_coef, @@ -50,12 +65,14 @@ def __init__(self, pipeline_config: AgenticConfig): kl_horizon=self.pipeline_config.kl_horizon, ) + # INIT PHASE: Create Clusters self.actor_train: Any = Cluster( name=self.pipeline_config.actor_train.name, worker_cls=self.pipeline_config.actor_train.worker_cls, resource_manager=self.resource_manager, worker_config=self.pipeline_config.actor_train, ) + self.actor_infer: Any = Cluster( name=self.pipeline_config.actor_infer.name, worker_cls=self.pipeline_config.actor_infer.worker_cls, @@ -64,7 +81,7 @@ def __init__(self, pipeline_config: AgenticConfig): ) download_clusters = [self.actor_train, self.actor_infer] - if self.pipeline_config.enable_reference: + if self.use_ref_model: self.reference: Any = Cluster( name=self.pipeline_config.reference.name, worker_cls=self.pipeline_config.reference.worker_cls, @@ -73,6 +90,7 @@ def __init__(self, pipeline_config: AgenticConfig): ) download_clusters.append(self.reference) + if self.pipeline_config.adv_estimator == "gae": self.critic: Any = Cluster( name=self.pipeline_config.critic.name, @@ -81,10 +99,46 @@ def __init__(self, pipeline_config: AgenticConfig): worker_config=self.pipeline_config.critic, ) download_clusters.append(self.critic) + + # INIT PHASE: Create Reward Cluster (if device_mapping is configured) + self.reward = None + self.reward_scheduler = None + if ( + self.pipeline_config.reward is not None + and len(self.pipeline_config.reward.device_mapping) > 0 + ): + self.reward: Any = Cluster( + name=self.pipeline_config.reward.name, + worker_cls=self.pipeline_config.reward.worker_cls, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reward, + ) + download_clusters.append(self.reward) + + # INIT PHASE: Download Models self.download_models(*download_clusters) self.tokenizer = default_tokenizer_provider(model_args=self.pipeline_config.actor_train.model_args) + if self.reward: + # Create reward scheduler as Ray named actor for environment managers to access + self.reward_scheduler = RequestScheduler.options( + name=f"RewardScheduler-{self.pipeline_config.reward.name}", + 
get_if_exists=True, + namespace=RAY_NAMESPACE, + scheduling_strategy=NodeAffinitySchedulingStrategy( + node_id=ray.get_runtime_context().get_node_id(), + soft=False, + ), + ).remote( + infer_cluster=self.reward, + pipeline_config=self.pipeline_config, + resource_manager=self.resource_manager, + ) + logger.info(f"Created reward scheduler as Ray named actor: RewardScheduler-{self.pipeline_config.reward.name}") + + # INIT PHASE: Create RolloutSchedulers self.train_rollout_scheduler = ray.remote(RolloutScheduler).options( + name="RolloutScheduler-train", scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False)).remote( @@ -94,7 +148,9 @@ def __init__(self, pipeline_config: AgenticConfig): infer_cluster=self.actor_infer, mode="train", ) + self.val_rollout_scheduler = ray.remote(RolloutScheduler).options( + name="RolloutScheduler-val", scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False)).remote( @@ -107,16 +163,23 @@ def __init__(self, pipeline_config: AgenticConfig): self.val_dataset_manager = GlobalDatasetManager.options(name=f"val_dataset_manager", get_if_exists=True, namespace=RAY_NAMESPACE).remote() + # INIT PHASE: Initialize Clusters refs: List[ray.ObjectRef] = [] refs.extend(self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=False)) if self.pipeline_config.adv_estimator == "gae": refs.extend(self.critic.initialize(pipeline_config=self.pipeline_config, blocking=False)) ray.get(refs) - self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=True) + refs = [] + if self.reward: + # INIT PHASE: Initialize Reward Cluster + refs.extend(self.reward.initialize(pipeline_config=self.pipeline_config, blocking=False)) + refs.extend(self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=False)) + ray.get(refs) - if self.pipeline_config.enable_reference: + if self.use_ref_model: refs.extend(self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True)) + # INIT PHASE: Setup Operations self.set_model_update_pair( src_cluster=self.actor_train, tgt_cluster=self.actor_infer, @@ -130,6 +193,12 @@ def __init__(self, pipeline_config: AgenticConfig): self.running = RunningMoments() + # Validate partial GPU mode configuration and set self.partial_gpu_mode + if self.pipeline_config.partial_gpu_mode: + self.partial_gpu_mode = self._validate_partial_gpu_config() + else: + self.partial_gpu_mode = False + @torch.no_grad() def run(self): # Calculate tokens-per-second system throughput @@ -145,58 +214,156 @@ def run(self): # Add overall step timing with Timer(name="pipeline_step_total", logger=None) as step_timer: with tps_timer: + # PHASE 1: Offload States if self.pipeline_config.adv_estimator == "gae": self.critic.offload_states(blocking=True) self.actor_train.offload_states(blocking=True) + # PHASE 2: Suspend & Stop Server + # Suspend rollout scheduler to pause request processing ray.get(self.train_rollout_scheduler.suspend.remote()) - if self.pipeline_config.async_generation_ratio > 0: - self.actor_infer.stop_server() + # Stop generation server if using async mode (will restart after model update) + if self.pipeline_config.async_pipeline: + self.actor_infer.offload_states(include=OffloadStateType.other_params) + + # PHASE 3: Model Update with Timer(name="model_update", logger=None) as model_update_timer: model_update_metrics: Dict = self.model_update(global_step) metrics["time/step_model_update"] =model_update_timer.last 
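A self-contained sketch of the Ray named-actor pattern used for the RewardScheduler and the RolloutSchedulers earlier in this hunk: pin the actor to the driver's node with NodeAffinitySchedulingStrategy and make creation idempotent with get_if_exists. The actor name and namespace below are placeholders, not the pipeline's real values.

import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

@ray.remote
class DemoScheduler:
    def ping(self):
        return "ok"

ray.init()
handle = DemoScheduler.options(
    name="RewardScheduler-demo",   # placeholder; the pipeline derives this from the worker name
    namespace="roll_demo",         # placeholder for RAY_NAMESPACE
    get_if_exists=True,            # reuse the actor if an earlier run already created it
    scheduling_strategy=NodeAffinitySchedulingStrategy(
        node_id=ray.get_runtime_context().get_node_id(),  # keep it on the driver node
        soft=False,
    ),
).remote()
assert ray.get(handle.ping.remote()) == "ok"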
- metrics.update(model_update_metrics) - if self.pipeline_config.async_generation_ratio > 0: - self.actor_infer.start_server(data=DataProto(meta_info={"global_step": global_step, "is_offload_states": False})) - else: - self.actor_infer.start_server(data=DataProto(meta_info={"global_step": global_step, "is_offload_states": True})) + + # PHASE 4: init kv cache + self.actor_infer.load_states() + if self.reward: + self.reward.load_states() + + # PHASE 5: Expand Sampler (partial GPU mode, step > 0) + # Restore routing state: model_update loaded states to ALL GPUs, now update active_dp_ranks + # Step 0: active_dp_ranks initialized with all ranks {0,1,2,3}, no expand needed + # Step 1+: After shrink in previous iteration, active_dp_ranks was {2,3}. + # model_update just loaded states to [0,1,2,3], so update routing state to match. + # Use skip_load=True to avoid re-loading already-loaded model states. + if self.partial_gpu_mode and global_step > 0: + target_gpus = [] + if hasattr(self.actor_train.worker_config, 'device_mapping') and self.actor_train.worker_config.device_mapping: + target_gpus.extend(self.actor_train.worker_config.device_mapping) + if self.pipeline_config.adv_estimator == "gae": + if hasattr(self.critic.worker_config, 'device_mapping') and self.critic.worker_config.device_mapping: + target_gpus.extend(self.critic.worker_config.device_mapping) + + if target_gpus: + expand_metrics = ray.get( + self.train_rollout_scheduler.expand_sampler.remote(target_gpus, skip_load=True) + ) + logger.info(f"Expand routing state (skip_load): {expand_metrics}") + metrics.update({"expand/" + k: v for k, v in expand_metrics.items()}) batch: DataProto = DataProto() batch.meta_info = {"global_step": global_step} - if self.pipeline_config.eval_steps > 0 and global_step % self.pipeline_config.eval_steps == 0: - with Timer(name="val", logger=None) as val_timer: - metrics.update(self.val(global_step=global_step)) - metrics["time/step_val"] = val_timer.last + # PHASE 6: Validation (every eval_steps) - Async + val_future = None + val_metrics = {} + with Timer(name="val", logger=None) as val_timer: + if self.pipeline_config.eval_steps > 0 and global_step % self.pipeline_config.eval_steps == 0: + # Submit val task to thread pool asynchronously + val_future = self.executor.submit(self.val, global_step) + + # PHASE 7: Rollout Get Batch + with Timer(name="rollout", logger=None) as rollout_timer: + batch = ray.get(self.train_rollout_scheduler.get_batch.remote(batch, self.pipeline_config.rollout_batch_size)) + sample_uuids = [f"{traj_id}_{i}" for i, traj_id in enumerate(batch.non_tensor_batch['traj_id'])] + batch.non_tensor_batch['sample_uuid'] = np.array(sample_uuids, dtype=object) + if "get_batch_return_start_time" in batch.meta_info: + metrics["time/get_batch_cost_train"] = time.time() - batch.meta_info.pop("get_batch_return_start_time") + actor_infer_metrics = self.actor_infer.get_metrics() + metrics.update(reduce_metrics(actor_infer_metrics.meta_info.pop("metrics", {}))) + metrics.update(compute_rollout_traj_metrics(batch)) + + dump_rollout_trajectories(self.pipeline_config.rollout_dump_dir, global_step, batch) + + metrics["time/step_rollout"] = rollout_timer.last + metrics.update(reduce_metrics(batch.meta_info.pop("metrics", {}))) + batch.meta_info["global_step"] = global_step + batch.meta_info["_broadcast_non_tensor_batch"] = True + batch.meta_info["loss_mask_keys"] = ["response_mask"] - with Timer(name="rollout", logger=None) as rollout_timer: - batch.meta_info["is_offload_states"] = True - batch = 
ray.get(self.train_rollout_scheduler.get_batch.remote(batch, self.pipeline_config.rollout_batch_size)) - sample_uuids = [f"{traj_id}_{i}" for i, traj_id in enumerate(batch.non_tensor_batch['traj_id'])] - batch.non_tensor_batch['sample_uuid'] = np.array(sample_uuids, dtype=object) - if "get_batch_return_start_time" in batch.meta_info: - metrics["time/get_batch_cost_train"] = time.time() - batch.meta_info.pop("get_batch_return_start_time") - actor_infer_metrics = self.actor_infer.get_metrics() - metrics.update(reduce_metrics(actor_infer_metrics.meta_info.pop("metrics", {}))) + # PHASE 8: Stop Server Sync (sync mode only) - Wait for async val to complete + if val_future is not None: + val_metrics = val_future.result() - dump_rollout_trajectories(self.pipeline_config.rollout_dump_dir, global_step, batch) + if len(val_metrics) > 0: + metrics.update(val_metrics) + metrics["time/step_val"] = val_timer.last - metrics["time/step_rollout"] = rollout_timer.last - metrics.update(reduce_metrics(batch.meta_info.pop("metrics", {}))) - batch.meta_info["global_step"] = global_step - if not (self.pipeline_config.async_generation_ratio > 0): - self.actor_infer.stop_server() + if not self.pipeline_config.async_pipeline: + # Suspend scheduler before offload actor infer, because there may be + # some inflight redundant trajectories. + ray.get(self.train_rollout_scheduler.suspend.remote()) + self.actor_infer.offload_states() + if self.reward: + self.reward.offload_states() + + # PHASE 9: Shrink Sampler (partial GPU mode) + # Partial GPU overlap: Shrink sampler to free training GPUs before training phase + # This offloads actor_infer models from training GPUs (e.g., [0,1]) so they can be + # used by actor_train and critic for the training phase. After shrink, actor_infer + # only has models loaded on inference-dedicated GPUs (e.g., [2,3]). 
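# Illustrative sketch (not part of this patch): how the shrink step decides which
# inference DP ranks stay active. Device IDs follow the example in these comments
# (actor_infer on [0,1,2,3], actor_train on [0,1]); tp_size * pp_size = 1 is assumed,
# so each DP rank owns exactly one GPU.
infer_devices = [0, 1, 2, 3]   # actor_infer.device_mapping, sorted
freed_gpus = {0, 1}            # actor_train (+ critic) GPUs released for training
gpus_per_dp_rank = 1           # tensor_parallel_size * pipeline_parallel_size

active_ranks = [
    dp_rank
    for dp_rank in range(len(infer_devices) // gpus_per_dp_rank)
    if set(infer_devices[dp_rank * gpus_per_dp_rank:(dp_rank + 1) * gpus_per_dp_rank])
       .isdisjoint(freed_gpus)
]
print(active_ranks)  # -> [2, 3]: only the ranks on GPUs 2 and 3 keep serving rollouts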
+ # + # Example with actor_infer on [0,1,2,3], actor_train on [0,1]: + # Before shrink: actor_infer has models on all GPUs [0,1,2,3] + # After shrink: actor_infer offloads from [0,1], keeps models on [2,3] + # During training: actor_train uses freed GPUs [0,1] + # Next iteration: model_update reloads actor_infer to all GPUs [0,1,2,3] + elif self.partial_gpu_mode: + with Timer(name="cal_ref_log_probs", logger=None) as shrink_timer: + target_gpus = [] + # Collect actor_train GPUs + if hasattr(self.actor_train.worker_config, 'device_mapping') and self.actor_train.worker_config.device_mapping: + target_gpus.extend(self.actor_train.worker_config.device_mapping) + # Collect critic GPUs if using GAE + if self.pipeline_config.adv_estimator == "gae": + if hasattr(self.critic.worker_config, 'device_mapping') and self.critic.worker_config.device_mapping: + target_gpus.extend(self.critic.worker_config.device_mapping) + + assert target_gpus, "cannot be empty" + shrink_metrics = ray.get(self.train_rollout_scheduler.shrink_sampler.remote(target_gpus)) + logger.info(f"Shrink sampler: {shrink_metrics}") + metrics.update({"shrink/" + k: v for k, v in shrink_metrics.items()}) + metrics["time/step_shrink"] = shrink_timer.last batch = compute_discounted_returns(batch, self.pipeline_config.adv_estimator, self.pipeline_config.step_reward_gamma) batch = self.adjust_batch(batch, mode=self.pipeline_config.batch_adjust_mode) metrics.update(reduce_metrics(batch.meta_info.pop("metrics", {}))) + # PHASE 11: Reference Log Probs with Timer(name="cal_ref_log_probs", logger=None) as cal_timer: + # TODO better the code structure, move the dynamic batching and sequence packing to worker/strategy if self.pipeline_config.enable_reference: - ref_log_probs_refs: List[ray.ObjectRef] = self.reference.compute_log_probs(batch, blocking=False) + worker_config = self.pipeline_config.reference if self.use_ref_model else self.pipeline_config.actor_train + worker = self.reference if self.use_ref_model else self.pipeline_config.actor_train + if worker_config.use_dynamic_batching_in_infer: + batch, dynamic_batching_metrics = dynamic_batching_shard( + batch, + worker.dp_size, + worker_config.max_tokens_per_microbatch_in_infer, + worker_config.sequence_length_round_in_infer, + worker_config.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + worker_config.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), + "reference/compute_log_probs", + ) + metrics.update(dynamic_batching_metrics) + if not self.use_ref_model: + batch.meta_info["disable_adapter"] = True + batch.meta_info["is_offload_states"] = False + batch_balance(batch, dp_size=self.actor_train.dp_size, minibatch_size=len(batch)) + ref_log_probs_refs: List[ray.ObjectRef] = self.actor_train.compute_log_probs(batch, blocking=False) + else: + batch_balance(batch, dp_size=self.reference.dp_size, minibatch_size=len(batch)) + ref_log_probs_refs: List[ray.ObjectRef] = self.reference.compute_log_probs(batch, blocking=False) + ref_log_probs = DataProto.materialize_concat(data_refs=ref_log_probs_refs) ref_log_probs.rename(old_keys="log_probs", new_keys="ref_log_probs") batch = batch.union(ref_log_probs) @@ -205,9 +372,24 @@ def run(self): metrics.update({"critic/ref_log_prob/mean": avg_ref_log_prob.item()}) metrics["time/step_ref_log_probs_values_reward"] = cal_timer.last + # PHASE 12: Old Log Probs & Values with Timer(name="cal_old_log_probs_values", logger=None) as cal_old_logpb_timer: + if self.pipeline_config.enable_reference and not 
self.use_ref_model: + batch.meta_info["disable_adapter"] = False batch.meta_info["is_offload_states"] = False if self.pipeline_config.enable_old_logprobs_recompute: + batch_balance(batch, dp_size=self.actor_train.dp_size, minibatch_size=len(batch)) + if self.pipeline_config.actor_train.use_dynamic_batching_in_infer: + batch, dynamic_batching_metrics = dynamic_batching_shard( + batch, + self.actor_train.dp_size, + self.pipeline_config.actor_train.max_tokens_per_microbatch_in_infer, + self.pipeline_config.actor_train.sequence_length_round_in_infer, + self.pipeline_config.actor_train.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + self.pipeline_config.actor_train.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), + "actor_train/compute_log_probs", + ) + metrics.update(dynamic_batching_metrics) old_log_probs: DataProto = self.actor_train.compute_log_probs(batch, blocking=True) batch.batch["old_log_probs"] = old_log_probs.batch["log_probs"] avg_old_log_prob = masked_mean(batch.batch["old_log_probs"], batch.batch["response_mask"][:, 1:]) @@ -245,6 +427,7 @@ def run(self): metrics.update(mask_metrics) metrics["time/step_cal_response_level_mask"] = timer.last + # PHASE 13: Advantage Computation with Timer(name="cal_response_norm_rewards", logger=None) as timer: # Rewards need to be processed after grouping # We can group by tag(env_type)/traj_group_id(group)/batch(rollout_batch)... to compute rewards / advantages @@ -275,13 +458,35 @@ def run(self): metrics.update(reduce_metrics(batch.meta_info.pop("metrics", {}))) metrics["time/step_adv"] = timer.last + if self.pipeline_config.enable_old_logprobs_recompute: + batch, corr_metrics = apply_train_infer_correction_to_batch(self.pipeline_config, batch, + update_mask_keys=batch.meta_info['loss_mask_keys']) + metrics.update(corr_metrics) + + # PHASE 14: Training (critic + actor) with Timer(name="train_timer", logger=None) as train_timer: if self.pipeline_config.adv_estimator == "gae": critic_train_metrics_refs: List[ray.ObjectRef] = self.critic.train_step(batch, blocking=False) # implement critic warmup if self.pipeline_config.critic_warmup <= global_step: + batch_balance_metrics = batch_balance(batch, dp_size=self.actor_train.dp_size, + minibatch_size=self.actor_train.dp_size * self.pipeline_config.actor_train.training_args.per_device_train_batch_size * + self.pipeline_config.actor_train.training_args.gradient_accumulation_steps, + logging_prefix="global_seqlen/actor_train") + metrics.update(batch_balance_metrics) # update actor + if self.pipeline_config.actor_train.use_dynamic_batching_in_train: + batch, dynamic_batching_metrics = dynamic_batching_shard( + batch, + self.actor_train.dp_size, + self.pipeline_config.actor_train.max_tokens_per_microbatch_in_train, + self.pipeline_config.actor_train.sequence_length_round_in_train, + self.pipeline_config.actor_train.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + self.pipeline_config.actor_train.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), + "actor_train/train_step", + ) + metrics.update(dynamic_batching_metrics) actor_train_metrics_refs = self.actor_train.train_step(batch, blocking=False) actor_train_metrics: DataProto = DataProto.materialize_concat(data_refs=actor_train_metrics_refs) metrics.update(reduce_metrics(actor_train_metrics.meta_info.pop("metrics", {}))) @@ -293,7 +498,7 @@ def run(self): metrics["time/step_train"] = train_timer.last with Timer(name="compute_data_metrics", logger=None) 
as data_metrics_timer: - data_metrics = compute_data_metrics(batch=batch) + data_metrics = compute_train_data_metrics(batch=batch) metrics["time/step_compute_data_metrics"] = data_metrics_timer.last metrics.update(data_metrics) @@ -318,6 +523,10 @@ def run(self): log_res = [] batch_grouped = batch.group_by(keys="traj_id") for group_name, group_batch in batch_grouped.items(): + if "step" in group_batch.non_tensor_batch.keys(): + indices = torch.argsort(torch.from_numpy(group_batch.non_tensor_batch["step"].astype(np.int64))) + group_batch.reorder(indices) + prompt_mask = group_batch.batch["prompt_mask"] non_prompt_mask = torch.logical_not(group_batch.batch["prompt_mask"]) * group_batch.batch["attention_mask"] input_ids = group_batch.batch["input_ids"] @@ -327,7 +536,7 @@ def run(self): responses = self.tokenizer.batch_decode(response_ids_list, skip_special_tokens=False) episode_scores = group_batch.non_tensor_batch["episode_scores"].tolist() step_scores = group_batch.non_tensor_batch["step_scores"].tolist() - if not isinstance(step_scores[0], float): + if isinstance(step_scores[0], np.ndarray): step_scores = [t.tolist() for t in step_scores] log_item = [] @@ -361,8 +570,11 @@ def run(self): self.train_rollout_scheduler.shutdown.remote(), self.val_rollout_scheduler.shutdown.remote(), ]) + + logger.info("pipeline complete!") + def val(self, global_step): batch = DataProto() metrics = {} @@ -434,6 +646,11 @@ def adjust_batch(self, data: DataProto, mode="copy") -> DataProto: metrics = data.meta_info.get("metrics", {}) metrics["system/batch_add_count"] = 0 metrics["system/batch_remove_count"] = 0 + + # 防止删除所有样本导致空批次 + if mode == "delete" and threshold >= batch_size: + mode = "copy" + if mode == "delete": remove_indices = np.random.choice(batch_size, threshold, replace=False) remove_indices = np.sort(remove_indices) @@ -463,6 +680,190 @@ def adjust_batch(self, data: DataProto, mode="copy") -> DataProto: return adjusted_batch + def _validate_partial_gpu_config(self) -> bool: + """Derive partial_gpu_mode from device_mapping and validate all requirements. + + Universal validations (both Model A and B): + - Reference colocation with actor_train + + Partial mode validations (Model B only - when train ⊂ infer): + 1. Minimum DP size (≥2) + 2. Async generation requirement (>0) + 3. Critic disjoint from actor_train + 4. Freed GPU capacity check + 5. TP/PP/EP compatibility + 6. 
At least 1 rank remains active + + Returns: + partial_gpu_mode: True if train ⊂ infer (Configuration Model B), + False if train ∩ infer = ∅ (Configuration Model A) + + Raises: + ValueError: Invalid configuration (device_mapping overlap, capacity issues, + DP size too small, missing async_generation_ratio, reference not colocated) + """ + # rvst: yangpeng + # Extract device mappings + train_devices = set(self.actor_train.worker_config.device_mapping) + infer_devices = set(self.actor_infer.worker_config.device_mapping) + critic_devices = set(self.critic.worker_config.device_mapping) if hasattr(self, 'critic') and self.critic else set() + ref_devices = set(self.reference.worker_config.device_mapping) if self.pipeline_config.enable_reference else set() + reward_devices = set(self.reward.worker_config.device_mapping) if self.reward else set() + + # VAL: VAL_NON_EMPTY - ensure device_mapping not empty + if not train_devices or not infer_devices: + raise ValueError( + f"device_mapping cannot be empty: " + f"train={list(train_devices)}, infer={list(infer_devices)}" + ) + + # Universal validation: Reference must always colocate with actor_train (both Model A and B) + # VAL: VAL_SUBSET (exact match) - reference colocation + if self.pipeline_config.enable_reference: + assert ref_devices == train_devices, ( + f"Reference device_mapping must match actor_train exactly: " + f"ref={list(ref_devices)}, train={list(train_devices)}" + ) + + # Determine configuration mode + if train_devices.isdisjoint(infer_devices): + # Configuration Model A: Disjoint GPUs + partial_gpu_mode = False + logger.info("Detected Configuration Model A: Disjoint device_mapping, partial_gpu_mode=False") + return partial_gpu_mode + + elif train_devices.issubset(infer_devices) and len(train_devices) < len(infer_devices): + # Configuration Model B: Partial overlap + partial_gpu_mode = True + logger.info("Detected Configuration Model B: Subset device_mapping, partial_gpu_mode=True") + + # CRITICAL VALIDATIONS (6 checks for partial mode) + + # Validation 1: Minimum DP size + # VAL: VAL_INT_RANGE(min=2, max=inf) - infer_dp_size + infer_dp_size = self.actor_infer.worker_config.world_size + assert infer_dp_size >= 2, ( + f"partial_gpu_mode requires actor_infer.dp_size >= 2, " + f"got {infer_dp_size}" + ) + + # Validation 2: Async generation required + # VAL: VAL_INT_RANGE(min=0.0, exclusive) - async_generation_ratio + async_ratio = self.pipeline_config.async_generation_ratio + assert async_ratio > 0, ( + f"partial_gpu_mode requires async_generation_ratio > 0, got {async_ratio}" + ) + + # Validation 3: Critic disjoint validation + # VAL: VAL_SUBSET(critic_devices, infer_devices) + disjoint check + if hasattr(self, 'critic') and self.critic is not None: + assert critic_devices.issubset(infer_devices), ( + f"Critic device_mapping must be subset of actor_infer: " + f"critic={list(critic_devices)}, infer={list(infer_devices)}" + ) + assert critic_devices.isdisjoint(train_devices), ( + f"Critic device_mapping must be disjoint from actor_train: " + f"critic={list(critic_devices)}, train={list(train_devices)}" + ) + + # Validation 4: Freed GPU capacity + # VAL: VAL_INT_RANGE - freed GPU count check (no overlap) + + + # Validation 5: TP/PP/EP compatibility + # VAL: VAL_INT_RANGE(min=1) + device_mapping divisibility check + # Extract TP and PP sizes from strategy config since workers aren't initialized yet + infer_strategy_config = self.actor_infer.worker_config.strategy_args.strategy_config + tp_size = 
infer_strategy_config.get("tensor_parallel_size", 1) + pp_size = infer_strategy_config.get("pipeline_parallel_size", 1) + + assert tp_size >= 1 and pp_size >= 1, ( + f"tp_size and pp_size must be >= 1: tp={tp_size}, pp={pp_size}" + ) + + expected_gpu_count = tp_size * pp_size * infer_dp_size + actual_gpu_count = len(infer_devices) + assert expected_gpu_count == actual_gpu_count, ( + f"Parallelism configuration mismatch: " + f"tp_size * pp_size * dp_size = {tp_size} * {pp_size} * {infer_dp_size} = {expected_gpu_count}, " + f"but device_mapping has {actual_gpu_count} GPUs" + ) + + # Validation 6: At least 1 rank remains active + # VAL: VAL_SUBSET, AST: AST_POSTCONDITION(remaining_ranks >= 1) + gpus_per_dp_rank = tp_size * pp_size + freed_gpus = train_devices | critic_devices + freed_gpu_list = list(freed_gpus) + self._validate_minimum_active_ranks( + infer_dp_size, infer_devices, freed_gpu_list, gpus_per_dp_rank + ) + + logger.info( + f"Partial GPU mode validated: infer_dp_size={infer_dp_size}, " + f"freed_gpus={sorted(freed_gpus)}" + ) + + return partial_gpu_mode + + else: + partial_gpu_mode = False + assert len(train_devices) == len(infer_devices) + len(reward_devices), "colocating mode" + assert self.pipeline_config.async_generation_ratio == 0, "colocating mode only support sync/on-policy training" + + return partial_gpu_mode + + + def _validate_minimum_active_ranks( + self, + infer_dp_size: int, + infer_devices: set, + freed_gpu_list: list, + gpus_per_dp_rank: int + ) -> None: + """Validate at least 1 DP rank remains active after shrink. + + Args: + infer_dp_size: Total DP size + infer_devices: Infer device_mapping (as set for validation) + freed_gpu_list: List of GPUs to free (train_devices | critic_devices) + gpus_per_dp_rank: GPUs per DP rank (tp * pp) + + Raises: + ValueError: If all ranks would be offloaded + """ + # First validate that freed GPUs are subset of infer GPUs + freed_gpu_set = set(freed_gpu_list) + if not freed_gpu_set.issubset(infer_devices): + raise ValueError( + f"Freed GPUs (train + critic) must be subset of infer device_mapping: " + f"freed={sorted(freed_gpu_list)}, infer={sorted(infer_devices)}" + ) + + # Convert infer_devices to ordered list to match DP rank assignment + infer_devices_list = sorted(list(infer_devices)) + + # Iterate through all DP ranks to find at least one that remains active + # Each DP rank uses gpus_per_dp_rank consecutive GPUs from device_mapping + at_least_one_active = False + for dp_rank in range(infer_dp_size): + # Get GPU range for this DP rank + start_idx = dp_rank * gpus_per_dp_rank + end_idx = start_idx + gpus_per_dp_rank + dp_rank_gpus = set(infer_devices_list[start_idx:end_idx]) + + # Check if this DP rank's GPUs are NOT in the freed set + if dp_rank_gpus.isdisjoint(freed_gpu_set): + at_least_one_active = True + break + + if not at_least_one_active: + raise ValueError( + f"At least 1 DP rank must remain active after shrink. " + f"All {infer_dp_size} DP ranks have at least one GPU in freed set. 
" + f"infer_devices={sorted(infer_devices_list)}, freed_gpus={sorted(freed_gpu_list)}, " + f"gpus_per_rank={gpus_per_dp_rank}" + ) + def get_episode_scores(batch: DataProto) -> torch.Tensor: batch_group_by_traj: Dict[str, DataProto] = batch.group_by(keys="traj_id") scores = [] @@ -487,17 +888,48 @@ def get_traj_env_time(batch: DataProto) -> torch.Tensor: scores.append(episode_scores) return torch.tensor(scores, dtype=torch.float32) -def compute_data_metrics(batch): + +def compute_rollout_traj_metrics(batch) -> Dict: + """ + Compute metrics for the rollout trajectory, before sample for train + """ + episode_scores = get_episode_scores(batch) + # fix: https://github.com/volcengine/verl/pull/60 + response_mask = batch.batch["response_mask"][:, 1:].bool() + prompt_mask = batch.batch["prompt_mask"].bool() # 首轮 prompt length + prompt_lengths = prompt_mask.sum(-1).float() # (batch_size,) + response_length = response_mask.sum(-1).float() # (batch_size,) + non_prompt_mask = (torch.logical_not(batch.batch["prompt_mask"]) * batch.batch["attention_mask"]).float().sum(-1) + + metrics = { + # score, sequence_score from env + "rollout/score/mean": torch.mean(episode_scores).detach().item(), + "rollout/score/max": torch.max(episode_scores).detach().item(), + "rollout/score/min": torch.min(episode_scores).detach().item(), + # response length + "rollout/response_length/mean": torch.mean(response_length).detach().item(), + "rollout/response_length/max": torch.max(response_length).detach().item(), + "rollout/response_length/min": torch.min(response_length).detach().item(), + # prompt length + "rollout/prompt_length/mean": torch.mean(prompt_lengths).detach().item(), + "rollout/prompt_length/max": torch.max(prompt_lengths).detach().item(), + "rollout/prompt_length/min": torch.min(prompt_lengths).detach().item(), + # non-prompt length + "rollout/non_prompt_length/mean": torch.mean(non_prompt_mask).detach().item(), + "rollout/non_prompt_length/max": torch.max(non_prompt_mask).detach().item(), + "rollout/non_prompt_length/min": torch.min(non_prompt_mask).detach().item(), + } + return metrics + +def compute_train_data_metrics(batch): + """ + Compute metrics on the training data. + This is different from `rollout_traj`: `rollout_traj` contains trajectory data for the entire batch, + while under `step_wise`, `train_batch` is sampled from `rollout_batch`, so the data distributions will differ. 
+ """ # token_level_scores are per-token scores assigned by the reward model, possibly after normalization/clipping # score denotes the raw environment reward episode_scores = get_episode_scores(batch) - try: - traj_rollout_times = get_traj_rollout_time(batch) - traj_env_times = get_traj_env_time(batch) - except Exception as e: - traj_rollout_times = torch.zeros(batch.batch.batch_size[0], dtype=torch.float32) - traj_env_times = torch.zeros(batch.batch.batch_size[0], dtype=torch.float32) - sequence_reward = batch.batch["token_level_rewards"].sum(-1) advantages = batch.batch["advantages"] # fix: https://github.com/volcengine/verl/pull/60 @@ -543,17 +975,6 @@ def compute_data_metrics(batch): "tokens/non_prompt_length/mean": torch.mean(non_prompt_mask).detach().item(), "tokens/non_prompt_length/max": torch.max(non_prompt_mask).detach().item(), "tokens/non_prompt_length/min": torch.min(non_prompt_mask).detach().item(), - - # # traj_rollout_time - "env/traj_rollout_time/mean": torch.mean(traj_rollout_times).detach().item() if traj_rollout_times.numel() > 0 else 0.0, - "env/traj_rollout_time/max": torch.max(traj_rollout_times).detach().item() if traj_rollout_times.numel() > 0 else 0.0, - "env/traj_rollout_time/min": torch.min(traj_rollout_times).detach().item() if traj_rollout_times.numel() > 0 else 0.0, - - # traj_env_times - "env/traj_env_time/mean": torch.mean(traj_env_times).detach().item() if traj_env_times.numel() > 0 else 0.0, - "env/traj_env_time/max": torch.max(traj_env_times).detach().item() if traj_env_times.numel() > 0 else 0.0, - "env/traj_env_time/min": torch.min(traj_env_times).detach().item() if traj_env_times.numel() > 0 else 0.0, - } if "values" in batch.batch.keys(): diff --git a/roll/pipeline/agentic/agentic_rollout_pipeline.py b/roll/pipeline/agentic/agentic_rollout_pipeline.py index 0586d3ec9..f304cf8f2 100644 --- a/roll/pipeline/agentic/agentic_rollout_pipeline.py +++ b/roll/pipeline/agentic/agentic_rollout_pipeline.py @@ -13,6 +13,7 @@ from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.agentic.agentic_config import AgenticConfig +from roll.pipeline.agentic.agentic_pipeline import get_episode_scores from roll.pipeline.agentic.utils import dump_rollout_trajectories from roll.pipeline.base_pipeline import BasePipeline from roll.utils.functionals import ( @@ -64,9 +65,7 @@ def run(self): batch.meta_info = {"global_step": global_step} with Timer(name="rollout", logger=None) as rollout_timer: - if self.use_policy_model: - batch.meta_info["is_offload_states"] = True - self.actor_infer.start_server(data=batch) + self.actor_infer.load_states() batch = ray.get(self.rollout_scheduler.get_batch.remote(batch, self.pipeline_config.rollout_batch_size)) if batch is None: break @@ -78,14 +77,14 @@ def run(self): metrics["time/step_rollout"] = rollout_timer.last eval_metrics = reduce_metrics(batch.meta_info.get("metrics", {})) - eval_score = batch.batch["scores"].sum(-1) + eval_score = get_episode_scores(batch) eval_metrics["score/mean"] = torch.mean(eval_score).detach().item() eval_metrics["score/max"] = torch.max(eval_score).detach().item() eval_metrics["score/min"] = torch.min(eval_score).detach().item() batch_grouped = batch.group_by(keys="tags") for group_name, group_batch in batch_grouped.items(): - eval_score = group_batch.batch["scores"].sum(-1) + eval_score = get_episode_scores(group_batch) eval_metrics[f"{group_name}/score/mean"] = torch.mean(eval_score).detach().item() 
eval_metrics[f"{group_name}/score/max"] = torch.max(eval_score).detach().item() eval_metrics[f"{group_name}/score/min"] = torch.min(eval_score).detach().item() diff --git a/roll/pipeline/agentic/env/__init__.py b/roll/pipeline/agentic/env/__init__.py index e32f6d6c7..1dc5e771d 100644 --- a/roll/pipeline/agentic/env/__init__.py +++ b/roll/pipeline/agentic/env/__init__.py @@ -8,11 +8,13 @@ gem.register("sokoban", entry_point="roll.pipeline.agentic.env.sokoban:SokobanEnv") gem.register("frozen_lake", entry_point="roll.pipeline.agentic.env.frozen_lake:FrozenLakeEnv") +gem.register("sokoban_mcp", entry_point="roll.pipeline.agentic.env.mcp:SokobanMCPEnv") gem.register("roll_math", entry_point="roll.pipeline.agentic.env.gem.math_env:MathEnv") gem.register("roll_code", entry_point="roll.pipeline.agentic.env.gem.code_env:CodeEnv") gem.register("roll_qa", entry_point="roll.pipeline.agentic.env.gem.qa_env:QaEnv") gem.register("sokoban_sandbox", entry_point="roll.pipeline.agentic.env.sandbox:SokobanSandboxEnv") - +gem.register("sokoban_native_env", entry_point="roll.pipeline.agentic.env.sokoban.native_env:SokobanNativeEnv") +gem.register("deepeyes", entry_point="roll.pipeline.agentic.env.deepeyes:DeepEyesEnv") try: # add webshop-minimal to PYTHONPATH diff --git a/roll/pipeline/agentic/env/deepeyes/__init__.py b/roll/pipeline/agentic/env/deepeyes/__init__.py new file mode 100644 index 000000000..7756b3fa5 --- /dev/null +++ b/roll/pipeline/agentic/env/deepeyes/__init__.py @@ -0,0 +1,7 @@ +""" +Adapted from the nicely written code from gym_sokoban +""" + +from .env import DeepEyesEnv + +__all__ = ["DeepEyesEnv"] diff --git a/roll/pipeline/agentic/env/deepeyes/env.py b/roll/pipeline/agentic/env/deepeyes/env.py new file mode 100644 index 000000000..8b3b31cfc --- /dev/null +++ b/roll/pipeline/agentic/env/deepeyes/env.py @@ -0,0 +1,451 @@ +import os +import random +import requests +import hashlib +import json +import PIL.Image as Image +from io import BytesIO +from typing import Optional, Dict, List, Tuple + +import datasets +import ray +import numpy as np +from dacite import from_dict +from gem import Env +from transformers.image_utils import load_image + +from roll.configs.data_args import DataArguments +from roll.distributed.scheduler.protocol import DataProto +from roll.datasets.global_dataset import GlobalDataset, GlobalDatasetManager +from roll.pipeline.rlvr.rlvr_config import RewardConfig +from roll.pipeline.agentic.llm_proxy.proxy_utils import generate_by_proxy +from roll.utils.checkpoint_manager import file_lock_context +from roll.utils.constants import RAY_NAMESPACE, EpisodeStopReason +from roll.utils.random_utils import all_seed +from roll.utils.logging import get_logger + +from .utils import VisualToolBoxV2, get_prompt + + +logger = get_logger() + + +def load_images(images, timeout=None): + out_images = [] + for image in images: + if isinstance(image, dict): + image = Image.open(BytesIO(image["bytes"])) + image = load_image(image, timeout) + out_images.append(image) + return out_images + + +def encode_function( + data, + prompt_getter, + ground_truth_getter, + image_getter, + env_getter, + data_source_getter, + question_getter, +): + image_list = [] + for idx, image in enumerate(image_getter(data)): + try: + image_out = load_images(image if isinstance(image, (list, tuple)) else [image], timeout=None) + except Exception as e: + image_num = len(image) if isinstance(image, (list, tuple)) else 1 + image_out = [Image.new("RGB", (224, 224), (255, 255, 255))] * image_num + image_list.append(image_out) + 
encodings = { + "data_source": data_source_getter(data), + "images": image_list, + "prompt": prompt_getter(data), + "env_name": env_getter(data), + "ground_truth": ground_truth_getter(data), + "question": question_getter(data), + } + return encodings + + +def encode_dataset(dataset, num_proc, encode_function, new_fingerprint=None): + # regularized data filed + features = datasets.Features( + { + "data_source": datasets.Value(dtype="string"), + "images": datasets.Sequence(feature=datasets.Image(mode=None, decode=True)), + "prompt": dataset.features["prompt"], + "env_name": datasets.Value(dtype="string"), + "ground_truth": datasets.Value(dtype="string"), + "question": datasets.Value(dtype="string"), + # use index to match dataset item with rollout item + # "index": datasets.Value(dtype="int"), + } + ) + remove_columns = list(dataset.features.keys() - features.keys()) + prompt_getter = lambda data: data["prompt"] + ground_truth_getter = lambda data: [x["ground_truth"] for x in data["reward_model"]] + image_getter = lambda data: data["images"] + env_getter = lambda data: data["env_name"] + data_source_getter = lambda data: data["data_source"] + question_getter = lambda data: [x["question"] for x in data["extra_info"]] + logger.info(f"Begin : {dataset}") + dataset = dataset.map( + lambda data: encode_function( + data, + prompt_getter, + ground_truth_getter, + image_getter, + env_getter, + data_source_getter, + question_getter, + ), + batched=True, + batch_size=100, + num_proc=num_proc, + features=features, + remove_columns=remove_columns, + new_fingerprint=new_fingerprint, + desc="Encoding dataset", + ) + logger.info(f"Encoding: {dataset}") + return dataset + + +@ray.remote +class DeepEyesDataset(GlobalDataset.__ray_actor_class__): + def __init__( + self, + dataset_name, + split: str = "train", + mode="sample", + dataset_kwargs: Dict = None, + seed: Optional[int] = None, + epoch: Optional[int] = 0, + idx: Optional[int] = 0, + ): + num_proc = dataset_kwargs.pop("num_proc", 1) + logger.info("load dataset") + super().__init__(dataset_name, split, mode, dataset_kwargs) + # use seed/epoch/idx to resume + self.seed = seed + self.epoch = epoch + self.idx = idx + logger.info("encode dataset") + self.dataset = encode_dataset(dataset=self.dataset, num_proc=num_proc, encode_function=encode_function) + if self.seed is not None and self.mode != "traversal": + self.dataset = self.dataset.shuffle(seed=self.seed + self.epoch) + + async def get_data_item(self, seed: int, **kwargs): + if self.idx == len(self.dataset): + self.epoch += 1 + self.idx = 0 + if self.mode != "traversal": + self.dataset = self.dataset.shuffle(seed=self.seed + self.epoch) + data = None + if seed not in self.seed_to_idx: + self.seed_to_idx[seed] = self.idx + if self.idx < len(self.dataset): + data = self.dataset[self.idx] + self.idx += 1 + else: + stored_idx = self.seed_to_idx[seed] + if stored_idx < len(self.dataset): + data = self.dataset[stored_idx] + return data + + + +class DeepEyesEnv(Env): + image_placeholder: str = "" + + def __init__( + self, + data_args, + mode: str = "train", + seed: Optional[int] = None, + epoch: Optional[int] = 0, + idx: Optional[int] = 0, + max_steps: int = 10, + acc_weight: float = 0.8, + format_weight: float = 0.2, + tool_weight: float = 1.2, + reward_tokenizer=None, + reward_proxy=None, + enable_thinking: bool = False, + reward_generating_args: Optional[Dict] = None, + current_env_id: Optional[int] = None, + ): + data_args: DataArguments = from_dict(data_class=DataArguments, data=data_args) + self.mode = 
mode + self.visual_toolbox = VisualToolBoxV2() + self.max_steps = max_steps + + # Reward weights + self.acc_weight = acc_weight + self.format_weight = format_weight + self.tool_weight = tool_weight + + # Reward inference components + self.reward_tokenizer = reward_tokenizer + self.reward_proxy = reward_proxy + self.enable_thinking = enable_thinking + # Default generation config for reward model if not provided + self.reward_generating_args = reward_generating_args or { + "temperature": 0.2, + "max_new_tokens": 2048, + "top_p": 0.95, + } + + # Store current_env_id for src_rank tracking in reward inference + self.current_env_id = current_env_id if current_env_id is not None else 0 + + # Episode tracking + self.step_count = 0 + self.has_tool_call_failure = False + + # Convert train/val mode to sample/traversal for GlobalDataset + global_dataset_mode = "sample" if self.mode == "train" else "traversal" + self.dataset = DeepEyesDataset.options( + name=f"{self.mode}_deepeyes", get_if_exists=True, namespace=RAY_NAMESPACE + ).remote( + dataset_name=data_args.file_name, + split="train", + dataset_kwargs={"num_proc": data_args.preprocessing_num_workers}, + mode=global_dataset_mode, + seed=seed, + epoch=epoch, + idx=idx, + ) + self.dataset_manager = GlobalDatasetManager.options( + name=f"{self.mode}_dataset_manager", get_if_exists=True, namespace=RAY_NAMESPACE + ).remote() + ray.get(self.dataset_manager.register.remote(dataset_name="deepeyes", dataset_ref=self.dataset)) + + def reset(self, seed=None): + data: Optional[Dict] = ray.get(self.dataset.get_data_item.remote(seed=seed)) + self._data_item = data + first_obs = {"prompt": self._data_item["prompt"], "image": [self._data_item["images"][0]]} + self.visual_toolbox.reset(first_obs["image"]) + + # Reset episode tracking + self.step_count = 0 + self.has_tool_call_failure = False + + return first_obs, {} + + def step(self, action: str): + self.step_count += 1 + + # Handle control-type actions (EpisodeStopReason) + # Similar to terminal_native_env.py:281-286 + if isinstance(action, EpisodeStopReason) and action == EpisodeStopReason.MAX_LENGTH: + # Force termination and compute reward + logger.info(f"[MAX_LENGTH] Episode terminated due to MAX_LENGTH, step_count={self.step_count}") + reward, reward_info = self.obtain_outcome_reward("") + info = {"metrics": {}, "metrics_agg_mode": self.visual_toolbox.metrics_agg_mode} + if reward_info: + info.update(reward_info) + return "", reward, True, True, info + + result, _, done, exe_info = self.visual_toolbox.execute(action) + info = {"metrics": exe_info, "metrics_agg_mode": self.visual_toolbox.metrics_agg_mode} + + # Track tool call failures: if a tool call was attempted but failed or was invalid + # success_tool_call is 1 when tool call succeeds, 0 otherwise + if exe_info.get("tool_call", 0) == 1 and exe_info.get("success_tool_call", 0) == 0: + self.has_tool_call_failure = True + + # Check if max_steps is reached + step_limit_reached = self.step_count >= self.max_steps + truncated = False + + # If step limit is reached, force episode termination + if step_limit_reached and not done: + done = True + truncated = True + logger.info(f"[MAX_STEPS] Reached maximum steps ({self.max_steps}), truncating episode") + + # Compute reward on the last step (when done=True) + # Pass the action (final model response) to obtain_outcome_reward + reward = 0.0 + if done: + reward, reward_info = self.obtain_outcome_reward(action) + if reward_info: + info.update(reward_info) + + return result, reward, done, truncated, info + + def 
obtain_outcome_reward(self, response: str) -> Tuple[float, Dict]: + """ + Compute the final reward for the episode using LLM-as-judge. + + This method is called in step() when the episode terminates (done=True). + It extracts the answer from the model response, validates the format, + calls the reward model (LLM judge) to evaluate accuracy, and computes + the final weighted reward. + + Args: + response: The final model response (action from the last step) + + Returns: + Tuple[float, Dict]: (final_reward, reward_info) + - final_reward: weighted combination of acc, format, and tool rewards + - reward_info: dict with detailed reward breakdown and metadata + """ + # Extract answer and validate format from the response + # Following DeepEyesRewardWorker._get_llm_judgment logic + answer_text, is_format_error = self._extract_answer(response) + + # Get LLM judgment for accuracy if reward proxy is available + # Following the exact logic from DeepEyesRewardWorker._get_llm_judgment + acc_reward = 0.0 + llm_response = None + + if self.reward_proxy is not None and self.reward_tokenizer is not None: + question = self._data_item["question"] + ground_truth = self._data_item["ground_truth"] + + # yali: 与使用prompt作为question有diff, prompt里包含了system/user, question只包含问题 + judge_prompt_text = get_prompt(answer_text, ground_truth, question) + judge_messages = [ + {"role": "system", "content": "You are a helpful assistant."}, + {"role": "user", "content": judge_prompt_text}, + ] + + # Call reward model through proxy + llm_response = generate_by_proxy( + messages=judge_messages, + tokenizer=self.reward_tokenizer, + proxy=self.reward_proxy, + enable_thinking=self.enable_thinking, + generation_config=self.reward_generating_args, + src_rank=self.current_env_id, + ) + + if llm_response is not None: + acc_reward = self._extract_score(llm_response) + else: + # LLM judgment failed, return -999.0 (invalid sample) + logger.warning("LLM judgment failed (returned None), marking sample as invalid") + return -999.0, { + "reward_info": { + "final_reward": -999.0, + "acc_reward": 0.0, + "format_reward": 0.0, + "tool_reward": 0.0, + "llm_judgment_failed": True, + "response": response, + "answer": answer_text, + } + } + + # Penalize for model trying to predict longer answer to hack llm-as-judge + if len(answer_text) >= 1000: + acc_reward = 0.0 + is_format_error = True + + # Compute component rewards + # tool_reward is based on whether vision tools were used successfully + # - step_count > 1 means tools were called + # - acc_reward > 0.5 means the answer is correct + # - has_tool_call_failure=False means all tool calls were successful + format_reward = -1.0 if is_format_error else 0.0 + tool_reward = 1.0 if self.step_count > 1 and acc_reward > 0.5 and not self.has_tool_call_failure else 0.0 + + # Compute final weighted reward + final_reward = ( + self.acc_weight * acc_reward + + self.format_weight * format_reward + + self.tool_weight * tool_reward + ) + + # Build detailed reward info + reward_info = { + "reward_info": { + "final_reward": final_reward, + "acc_reward": acc_reward, + "format_reward": format_reward, + "tool_reward": tool_reward, + "is_format_error": is_format_error, + "step_count": self.step_count, + "has_tool_call_failure": self.has_tool_call_failure, + "question": self._data_item.get("question"), + "ground_truth": self._data_item.get("ground_truth"), + "response": response, + "answer": answer_text, + "llm_response": llm_response, + } + } + + # logger.info(json.dumps(reward_info, ensure_ascii=False)) + return 
final_reward, reward_info + + def _extract_answer(self, predict_str: str) -> Tuple[str, bool]: + """ + Extract answer from model response and validate format. + + Args: + predict_str: The model's response string + + Returns: + Tuple[str, bool]: (answer_text, is_format_error) + """ + is_format_error = False + + # Check think tags + count_think_1 = predict_str.count("") + count_think_2 = predict_str.count("") + if count_think_1 != count_think_2: + is_format_error = True + + # Extract content after last + predict_no_think = predict_str.split("")[-1].strip() + + # Check answer tags + count_answer_1 = predict_no_think.count("") + count_answer_2 = predict_no_think.count("") + if count_answer_1 != count_answer_2: + is_format_error = True + + # Extract answer text + answer_text = predict_str.split("")[-1].split("")[0].strip() + + return answer_text, is_format_error + + def _extract_score(self, response: str) -> float: + """ + Extract accuracy score from LLM judge response. + + Args: + response: The LLM judge's response string + + Returns: + float: Accuracy reward (1.0 or 0.0) + """ + if "Judgement:" in response: + response = response.split("Judgement:")[-1].strip() + if "1" in response: + return 1.0 + elif "0" in response: + return 0.0 + else: + logger.warning(f"[WARNING] Response format error: {response}") + return 0.0 + else: + if response == "1": + return 1.0 + elif response == "0": + return 0.0 + else: + logger.warning(f"[WARNING] Response format error: {response}") + return 0.0 + + def add_extra_data(self, data: DataProto, messages: List[Dict]): + data.non_tensor_batch.update( + { + "question": np.array([self._data_item["question"]], dtype=object), + "ground_truth": np.array([self._data_item["ground_truth"]], dtype=object), + "message": np.array([messages], dtype=object), + } + ) diff --git a/roll/pipeline/agentic/env/deepeyes/utils.py b/roll/pipeline/agentic/env/deepeyes/utils.py new file mode 100644 index 000000000..554bfed03 --- /dev/null +++ b/roll/pipeline/agentic/env/deepeyes/utils.py @@ -0,0 +1,370 @@ +""" +reference: https://github.com/Visual-Agent/DeepEyes/blob/main/verl/workers/agent/envs/mm_process_engine/visual_toolbox_v2.py +""" + + +import numpy as np +from typing import Dict, Any +import re +import json +from math import ceil, floor + + +class PROMPT: + SYSTEM_PROMPT_V1 = """You are a helpful assistant. + # Tools + You may call one or more functions to assist with the user query. + You are provided with function signatures within XML tags: + + {"type":"function","function":{"name":"image_zoom_in_tool","description":"Zoom in on a specific region of an image by cropping it based on a bounding box (bbox).","parameters":{"type":"object","properties":{"image_path":{"type":"string","description":"Path or URL of the image to zoom in."},"bbox":{"type":"array","items":{"type":"number"},"minItems":4,"maxItems":4,"description":"The bounding box of the region to zoom in, as [x1, y1, x2, y2], where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner."}},"required":["image_path","bbox"]}}} + {"type":"function","function":{"name":"image_rotate_tool","description":"Rotate an image by a specified angle (clockwise or counterclockwise).","parameters":{"type":"object","properties":{"image_path":{"type":"string","description":"Path or URL of the image to be rotated."},"angle":{"type":"integer","description":"Rotation angle in degrees (e.g., 90, 180, 270). 
Positive values for clockwise, negative for counterclockwise."}},"required":["image_path","angle"]}}} + + For each function call, return a json object with function name and arguments within XML tags: + + {"name": , "arguments": } + """ + # user v1 failed, model do not output toolcall + USER_PROMPT_V1 = "\nReason in your mind and then give the final answer. Output strictly following the format [your inner thoughts][your final answer]." + # v2: no image_path + # SYSTEM_PROMPT_V2 = """You are a helpful assistant. + # # Tools + # You may call one or more functions to assist with the user query. + # You are provided with function signatures within XML tags: + # + # {"type":"function","function":{"name":"image_zoom_in_tool","description":"Zoom in on a specific region of an image by cropping it based on a bounding box (bbox).","parameters":{"type":"object","bbox":{"type":"array","items":{"type":"number"},"minItems":4,"maxItems":4,"description":"The bounding box of the region to zoom in, as [x1, y1, x2, y2], where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner."}},"required":["bbox"]}}} + # + # For each function call, return a json object with function name and arguments within XML tags: + # + # {"name": , "arguments": } + # """ + SYSTEM_PROMPT_V2 = """You are a helpful assistant. +# Tools +You may call one or more functions to assist with the user query. +You are provided with function signatures within XML tags: + +{"type":"function","function":{"name":"image_zoom_in_tool","description":"Zoom in on a specific region of an image by cropping it based on a bounding box (bbox) and an optional object label.","parameters":{"type":"object","properties":{"bbox_2d":{"type":"array","items":{"type":"number"},"minItems":4,"maxItems":4,"description":"The bounding box of the region to zoom in, as [x1, y1, x2, y2], where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner."},"label":{"type":"string","description":"The name or label of the object in the specified bounding box (optional)."}},"required":["bbox"]}}} + +# How to call a tool +Return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +**Example**: + +{"name": "image_zoom_in_tool", "arguments": {"bbox_2d": [10, 20, 100, 200], "label": "the apple on the desk"}} +""" + USER_PROMPT_V2 = "\nThink first, call **image_zoom_in_tool** if needed, then answer. Format strictly as: ... ... (if tools needed) ... " + SYSTEM_PROMPT_V3 = "" + USER_PROMPT_V3 = """\nIf the images provided above are sufficient to answer the user's question, please put your final answer within . +Otherwise generate a new grouding in JSON format: +```json\n{\n "function": "zoom_in",\n "bbox_2d": [x1, y1, x2, y2],\n "label": "object_name"\n}\n``` +The zoomed-in image of your grounding will be provided in next turn. +""" + SYSTEM_PROMPT_V4 = "" + USER_PROMPT_V4 = """\nIf the current images are insufficient to answer the question, request a zoom-in by providing this tool_call object within tags: + +{"function": "zoom_in", "bbox_2d": [x1, y1, x2, y2], "label": "object_name"} + +The zoomed image will be provided in the next turn. Otherwise, provide your answer within tags. +""" + SYSTEM_PROMPT_V5 = """You are a helpful assistant. 
+# Tools +You are provided with function signatures within XML tags: + +{"type":"function","function":{"name":"image_zoom_in_tool","description":"Zoom in on a specific region of an image by cropping it based on a bounding box (bbox) and an optional object label.","parameters":{"type":"object","properties":{"bbox_2d":{"type":"array","items":{"type":"number"},"minItems":4,"maxItems":4,"description":"The bounding box of the region to zoom in, as [x1, y1, x2, y2], where (x1, y1) is the top-left corner and (x2, y2) is the bottom-right corner."},"label":{"type":"string","description":"The name or label of the object in the specified bounding box (optional)."}},"required":["bbox"]}}} + +# How to call a tool +Return a json object with function name and arguments within XML tags: + +{"name": , "arguments": } + +You may call **one or more** functions to assist with the user query. +**Example**: + +{"name": "image_zoom_in_tool", "arguments": {"bbox_2d": [10, 20, 100, 200], "label": "the apple on the desk"}} + + +{"name": "image_zoom_in_tool", "arguments": {"bbox_2d": [8, 40, 50, 150], "label": "the person under the tree"}} +""" + # USER_PROMPT_V5 = "\nThink first, call **image_zoom_in_tool** one or more times if needed, i.e., ... ... ... (if any tools needed) OR ... (if no tools needed)." + # # 看第一轮的rollout,这个会有一些问题,导致模型最后没回答,只是说了一句信息完备,不用调工具了。后续观察score上涨很快,应该自己学会了! + # TURN_PROMPT_V5 = "\nAbove are the tool responses after calling {}. Think first, continue to call **image_zoom_in_tool** if needed. Format strictly as: ... ... ... (if any tools needed)." + # TURN_PROMPT_V5_PLUS = """Think in your mind first, Analyze the problem thoroughly. Determine if available information suffices or if tools are needed. Decide whether to call tools one or more times or provide final answer. + # Then execute one action: tools OR complete response + # """ + TURN_PROMPT_V5 = "\nThink in the mind first, and then decide whether to call tools one or more times OR provide final answer. Format strictly as: ... ... ... (if any tools needed) OR ... (if no tools needed)." + USER_PROMPT_V5 = TURN_PROMPT_V5 + + + +class VisualToolBoxV2(object): + name = "visual_toolbox_v2" + # user_prompt = "Here is the cropped image returned after you calling the function {}.\nIf the images provided above are sufficient to answer the user's question, please put your final answer within . Otherwise you can continue to call tools within ." + user_prompt = PROMPT.USER_PROMPT_V2 + metrics_agg_mode = { + "extract_answer": "sum", + "extract_none": "sum", + "invalid_tool_call": "sum", + "success_tool_call": "sum", + "failed_tool_call": "sum", + "tool_call": "sum", + } + + def __init__(self): + self.multi_modal_data = None # To store the current image being processed + + def extract_answer(self, action_string: str) -> Dict[str, any]: + answer = re.findall(r"(.*?)", action_string, re.DOTALL) + return answer[-1] if answer else None + + def extract_action(self, action_string: str) -> Dict[str, Any]: + """ + Extracts the tool call from the action string. + Args: + action_string: The string containing the tool call in XML tags. + Returns: + A dictionary with the tool name and arguments. + Raises: + ValueError: If no tool call is found or JSON is invalid. + """ + tool_call_match = re.findall(r"(.*?)", action_string, re.DOTALL) + return tool_call_match[-1] if tool_call_match else None + + def execute(self, action_string: str, **kwargs) -> tuple: + """ + Execute the tool functionality based on the action string. 
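# Illustrative sketch (not part of this patch): the tag-extraction pattern that
# extract_answer / extract_action implement. The concrete tag names are an assumption
# here (<answer> and <tool_call>); the helper takes the last occurrence, as the methods
# above do, and returns None when no block is present.
import re

def last_tagged_block(text: str, tag: str):
    matches = re.findall(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return matches[-1] if matches else None

print(last_tagged_block("<think>zoom in</think><answer>tan</answer>", "answer"))  # -> "tan"
print(last_tagged_block('<tool_call>{"name": "image_zoom_in_tool"}</tool_call>', "tool_call"))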
+ Args: + action_string: The string containing the tool call in XML tags. + Returns: + observation: The structured observation with the processed image. + reward: 0.1 if tool call is successful with correct JSON format, 0 otherwise. + done: Whether the episode is terminated. + info: Additional info. + """ + exe_info = { + "extract_answer": 0, + "extract_none": 0, + "invalid_tool_call": 0, + "success_tool_call": 0, + "failed_tool_call": 0, + "tool_call": 0, + } + answer = self.extract_answer(action_string) + if answer: + exe_info["extract_answer"] = 1 + return "", 0.0, True, exe_info + action = self.extract_action(action_string) + if not action: + exe_info["extract_none"] = 1 + return "", 0.0, True, exe_info + exe_info["tool_call"] = 1 + try: + tool_call = json.loads(action.strip()) + except Exception as e: + error_msg = f"Invalid tool call format: {action.strip()}. Error: {e}" + obs = f"Error: {str(error_msg)}" + exe_info["invalid_tool_call"] = 1 + return obs, 0.0, False, exe_info + try: + tool_name = tool_call["name"] + args = tool_call["arguments"] + if tool_name == "image_zoom_in_tool": + # Zoom in by cropping the image + # image_path = args["image_path"] + bbox = args["bbox_2d"] + bbox = self.maybe_resize_bbox(*bbox) + if not bbox: + raise ValueError(f"ZOOM IN ARGUMENTS ARE INVALID") + # img = Image.open(image_path) + img = self.multi_modal_data["image"][0] + cropped_img = img.crop(bbox) + current_image = cropped_img + elif tool_name == "image_rotate_tool": + # Rotate the image + # image_path = args["image_path"] + angle = args["angle"] + # img = Image.open(image_path) + img = self.multi_modal_data["image"][0] + rotated_img = img.rotate(angle) + current_image = rotated_img + else: + raise ValueError(f"Unknown tool name: {tool_name}") + obs = { + "prompt": "" + "" + self.user_prompt + "", + "image": [current_image], + } + reward = 0.0 # Reward for successful tool call with correct JSON + done = False + print(f"[DEBUG] SUCCESS ACTION {action_string=}") + exe_info["success_tool_call"] = 1 + return obs, reward, done, exe_info + except Exception as e: + # Return an error observation if something goes wrong + print(f"[DEBUG] Execute WRONG - {str(e)} {action_string=}") + obs = f"Error: {str(e)}" + reward = 0.0 # No reward for failed execution + done = False + exe_info["failed_tool_call"] = 1 + return obs, reward, done, exe_info + + def reset(self, image): + self.multi_modal_data = {"image": image} + self.height = self.multi_modal_data["image"][0].height + self.width = self.multi_modal_data["image"][0].width + + def validate_bbox(self, left, top, right, bottom): + try: + assert left < right and bottom > top, f"invalid shape for {left=}, {top=}, {right=}, {bottom=}" + height = bottom - top + width = right - left + assert max(height, width) / min(height, width) <= 100, ( + f"aspect ratio error: {left=}, {top=}, {right=}, {bottom=}" + ) + assert min(height, width) > 30, f"{height=}, {width=} is too small" + assert max(height, width) >= 56 and min(height, width) >= 14, ( + "images shape error, input image shape is too small" + ) + return True + except Exception as err: + print(f" [ERROR vl_agent #2] {err=}") + return False + + def maybe_resize_bbox(self, left, top, right, bottom): + left = max(0, left) + top = max(0, top) + right = min(self.width, right) + bottom = min(self.height, bottom) + if not self.validate_bbox(left, top, right, bottom): + return None + height = bottom - top + width = right - left + if height < 28 or width < 28: + center_x = (left + right) / 2.0 + center_y = (top + bottom) / 
2.0 + ratio = 28 / min(height, width) + new_half_height = ceil(height * ratio * 0.5) + new_half_width = ceil(width * ratio * 0.5) + new_left = floor(center_x - new_half_width) + new_right = ceil(center_x + new_half_width) + new_top = floor(center_y - new_half_height) + new_bottom = ceil(center_y + new_half_height) + if not self.validate_bbox(new_left, new_top, new_right, new_bottom): + return None + return [new_left, new_top, new_right, new_bottom] + return [left, top, right, bottom] + + +def get_chat_template(): + chat_template = """ +Below are two answers to a question. Question is [Question], [Standard Answer] is the standard answer to the question, and [Model_answer] is the answer extracted from a model's output to this question. Determine whether these two answers are consistent. +Note that [Model Answer] is consistent with [Standard Answer] whenever they are essentially the same. If the meaning is expressed in the same way, it is considered consistent, for example, 'pink' and 'it is pink'. +If they are consistent, Judement is 1; if they are different, Judement is 0. Just output Judement and don't output anything else.\n\n +""" + return chat_template + + +def get_gpt4_score_ICE(): + example_1 = """ +[Question]: Is the countertop tan or blue? +[Standard Answer]: The countertop is tan. +[Model_answer] : tan +Judgement: 1 +""" # noqa + example_2 = """ +[Question]: On which side of the picture is the barrier? +[Standard Answer]: The barrier is on the left side of the picture. +[Model_answer] : left +Judgement: 1 +""" # noqa + example_3 = """ +[Question]: Is the kite brown and large? +[Standard Answer]: Yes, the kite is brown and large. +[Model_answer] : Yes +Judgement: 1 +""" # noqa + example_4 = """ +[Question]: Are the spots on a giraffe? +[Standard Answer]: No, the spots are on a banana. +[Model_answer] : no +Judgement: 1 +""" # noqa + example_5 = """ +[Question]: Who is wearing pants? +[Standard Answer]: The boy is wearing pants. +[Model_answer] : The person in the picture is wearing pants. +Judgement: 1 +""" # noqa + example_6 = """ +[Question]: Is the man phone both blue and closed? +[Standard Answer]: Yes, the man phone is both blue and closed. +[Model_answer] : No. +Judgement: 0 +""" # noqa + example_7 = """ +[Question]: What color is the towel in the center of the picture? +[Standard Answer]: The towel in the center of the picture is blue. +[Model_answer] : The towel in the center of the picture is pink. +Judgement: 0 +""" # noqa + return [example_1, example_2, example_3, example_4, example_5, example_6, example_7] + + +COMMON_VERIFY_PROMPT = """# CONTEXT # +I am a teacher, and I have some high-level reasoning problems. I am tasked with evaluating the correctness of a student's answer. +Below, I am provided with a problem and a reference answer. Additionally, a student's answer is provided. My job is to assess whether the student's answer captures the same meaning as the reference answer, even when expressed with different wording or format. +# OBJECTIVE # +I need you to judge whether the student's answer is correct given the ground truth answer. +Your tasks include: +1. Identify Semantic Equivalence: Carefully examine the expression in both answers. Confirm whether the semantic meaning of student's final answer is equivalent to the reference answer, even when expressed with different wording or format. +# TONE # +Professional, scientific. +# RESPONSE: MARKDOWN REPORT # +## Equivalence Judgement +[Whether the student's answer share the same meaning with the reference answer. 
(TRUE or FALSE)] +# ATTENTION # + - The reference answer is ALWAYS correct. You should carefully judge whether the student gives the same answer as reference answer. + - The Equivalence Judgement is only TRUE or FALSE. The answer is FALSE even if the student's final answer almost correct with a minor mistakes. + - Don't give extra explanation. +**Question**: +{query} +**Reference Answer** +{gold_ans} +## Student Final Answer +{pred_ans}""" +MATH_VERIFY_PROMPT = """# CONTEXT # +I am a teacher, and I have some high-level math problems. I am tasked with evaluating the correctness of a student's answer. +Below, I am provided with a problem and a reference answer. Additionally, a student's answer is provided. My job is to assess whether the student's answer captures the same meaning as the reference answer, even when expressed with different wording or format. +# OBJECTIVE # +I need you to judge whether the student's answer is correct given the ground truth answer. +Your tasks include: +1. Identify Mathematical or Notational Equivalence: Pay special attention to any LaTeX expressions in both answers. Confirm that the mathematical relationships, variables, and operations conveyed are equivalent. +# TONE # +Professional, scientific. +# RESPONSE: MARKDOWN REPORT # +## Equivalence Judgement +[Whether the student's answer share the same meaning with the reference answer. (TRUE or FALSE)] +# ATTENTION # + - The reference answer is ALWAYS correct. You should carefully judge whether the student gives the same answer as reference answer. + - The Equivalence Judgement is only TRUE or FALSE. The answer is FALSE even if the student's final answer almost correct with a minor mistakes. + - Don't give extra explanation. +**Question**: +{query} +**Reference Answer** +{gold_ans} +## Student Final Answer +{pred_ans}""" + + +def get_prompt(predict_str, ground_truth, question): + examples = get_gpt4_score_ICE() + chat_template = get_chat_template() + demo_prompt = chat_template + for example in examples: + demo_prompt += example + "\n\n" + test_prompt = f""" +[Question]: {question} +[Standard Answer]: {ground_truth} +[Model_answer] : {predict_str} +Judgement:""" + full_prompt = f"{demo_prompt}{test_prompt}" + return full_prompt diff --git a/roll/pipeline/agentic/env/sokoban/env.py b/roll/pipeline/agentic/env/sokoban/env.py index c51552ae4..5e5fc19f0 100644 --- a/roll/pipeline/agentic/env/sokoban/env.py +++ b/roll/pipeline/agentic/env/sokoban/env.py @@ -115,7 +115,7 @@ def step(self, action: str): "action_desc": action_desc } info.update(action_info) - return next_obs, reward, False, False, info + return next_obs, reward, terminated, False, info previous_pos = self.player_position _, reward, terminated, _ = GymSokobanEnv.step(self, action_info["action"]) diff --git a/roll/pipeline/agentic/env/sokoban/native_env.py b/roll/pipeline/agentic/env/sokoban/native_env.py new file mode 100644 index 000000000..2538e6529 --- /dev/null +++ b/roll/pipeline/agentic/env/sokoban/native_env.py @@ -0,0 +1,284 @@ +from typing import Any, Dict, List, Tuple, SupportsFloat, Union + +from roll.pipeline.agentic.env.sokoban.env import SokobanEnv +from roll.utils.constants import EpisodeStopReason +from roll.utils.logging import get_logger + + +class SokobanNativeEnv(SokobanEnv): + """ + Sokoban environment for iflow native mode. + + This environment provides Sokoban puzzle functionality using the iflow native + architecture. 
It's a simplified implementation that works with AgentNativeStepEnvManager + without requiring external services like ROCK or iflow. + """ + + def __init__( + self, + group_id: int = 0, + num_env_groups: int = 1, + max_steps: int = 10, + mode: str = "train", + debug: bool = False, + dim_room: Tuple[int, int] = (6, 6), + num_boxes: int = 1, + search_depth: int = 300, + format_penalty: float = -0.1, + action_pattern: str = "(.*?)", + system_template: str = None, + observation_suffix: str = None, + **kwargs + ): + """ + Initialize Sokoban native environment. + """ + # Store environment parameters + self.group_id = group_id + self.num_env_groups = num_env_groups + self.mode = mode + self.debug = debug + + # Runtime state + self.current_step = 0 + self.task_idx = 0 + self.logger = get_logger() + self.reward = 0 + self.terminated = False + self.truncated = False + self.env_reset_failed = False + self.env_timeout = False + self.failure_mode = "" + self.stop_reason = "" + self.error_messages = [] + self.test_output = "" + self.is_closed = False + + # Message history for conversation + self.message_history = [] + + self.system_template = system_template + if self.system_template is None: + self.system_template = "You're a helpful assistant. You are a good game player. You are aiming to get high reward in the game." + + # Initialize parent SokobanEnv + super().__init__( + render_mode="text", + dim_room=dim_room, + max_steps=max_steps, + num_boxes=num_boxes, + search_depth=search_depth, + format_penalty=format_penalty, + action_pattern=action_pattern, + reset=False, + **kwargs + ) + self.observation_suffix = observation_suffix + if self.observation_suffix is None: + action_lookup_str = "\nYour available actions are:\n" + ", ".join( + [f"{v}" for k, v in self.ACTION_LOOKUP.items()]) + self.observation_suffix = (f"\n\n\nIMPORTANT: Ensure that your response is the format of ' [your answer] ', with no extra text, eg. Right." + f"{action_lookup_str}\n. \n\n" + f"Decide the next action:\n") + + def reset(self, seed=None) -> Tuple[List[Dict], Dict]: + """ + Reset the environment and return initial observation. + + Returns: + observation: List of messages for the agent + info: Dictionary containing tools, error_msg, and failure_mode + """ + super().reset(seed) + self._clean_state() + + # Get the text observation from parent + text_obs, env_info = super().reset(seed) + + # Initialize message history + self.message_history = [ + { + "role": "system", + "content": f"{self.system_template}\n\n{env_info.get('env_instruction', self.get_instructions())}" + }, + { + "role": "user", + "content": f"Here is the current state:\n{text_obs}\n\n{self.observation_suffix}" + } + ] + + # Return info with empty tools (Sokoban doesn't use tools) + info = { + "tools": [], + "error_msg": "", + "failure_mode": self.failure_mode + } + + return self.message_history, info + + def step(self, action: str) -> Tuple[Union[List[Dict], str], SupportsFloat, bool, bool, dict[str, Any]]: + """ + Execute one step in the environment. 
+ + Args: + action: Action string from the agent + + Returns: + observation: List of messages containing full conversation history + reward: Step reward + terminated: Whether episode ended + truncated: Whether episode was truncated + info: Additional information dictionary + """ + self.current_step += 1 + # Check for control actions + if isinstance(action, EpisodeStopReason): + if action in [EpisodeStopReason.MAX_LENGTH, EpisodeStopReason.ENV_TIMEOUT]: + self.terminated = True + self.truncated = True + self.stop_reason = action.name + observation = self.message_history # Return full history + return observation, self.reward, True, True, {} + + # Add assistant's response to message history + self.message_history.append({ + "role": "assistant", + "content": action + }) + + # Execute the action using parent step method + text_obs, reward, terminated, truncated, info = super().step(action) + + # Update state + self.reward = reward + self.terminated = terminated + self.truncated = truncated + + # Add new user message with updated state to message history + user_content = f"Current state:\n{text_obs}\n\n{self.observation_suffix}" + if not info.get("action_is_valid", True): + user_content = (f"\n\n\n(IMPORTANT TIPS: the last action is not valid, your new response *must* strictly adhere to the format according to the system-reminder.)\n\n" + f"{user_content}") + user_message = { + "role": "user", + "content": user_content + } + self.message_history.append(user_message) + + # Add metrics to info + metrics = info.get("metrics", {}) + metrics.update({ + "env_timeout": self.env_timeout, + "env_reset_failed": self.env_reset_failed, + "success": self.boxes_on_target == self.num_boxes, + "raw_reward": self.reward, + "task_id": self.task_idx + }) + + metrics_agg_mode = info.get("metrics_agg_mode", {}) + info_new = { + "metrics": metrics, + "metrics_agg_mode": metrics_agg_mode, + "failure_mode": self.failure_mode, + "error_messages": self.error_messages, + "stop_reason": self.stop_reason, + "test_output": self.test_output + } + info.update(info_new) + + return self.message_history, self.reward, self.terminated, self.truncated, info + + def _clean_state(self): + """Clean up state for new episode.""" + self.task_idx += 1 + self.current_step = 0 + self.reward = 0 + self.terminated = False + self.truncated = False + self.env_reset_failed = False + self.env_timeout = False + self.failure_mode = "" + self.stop_reason = "" + self.error_messages.clear() + self.test_output = "" + self.is_closed = False + self.message_history = [] # Clear message history for new episode + + def close(self): + """Close the environment.""" + super().close() + self.is_closed = True + + @property + def env_info(self) -> Dict: + """Return environment information.""" + return { + "task_idx": self.task_idx, + "dim_room": self.dim_room, + "num_boxes": self.num_boxes, + "max_steps": self.max_steps, + "current_step": self.current_step, + "boxes_on_target": self.boxes_on_target, + } + +if __name__ == '__main__': + + env = SokobanNativeEnv( + dim_room=(6, 6), + num_boxes=2, + max_steps=10, + ) + + print("=== SokobanNativeEnv Debug ===") + + # Reset environment + obs, info = env.reset(seed=42) + print("\n[Initial Observation]") + print(f"Number of messages: {len(obs)}") + print(f"System message: {obs[0]['content']}") + print(f"User message: {obs[1]['content'][:200]}...") + + # Test some actions + actions = [ + "Up", + "Right", + "Down", + "Left", + "Up", + ] + + for i, action in enumerate(actions): + print(f"\n=== Step {i+1} ===") + print(f"Action: 
{action}") + + obs, reward, terminated, truncated, info = env.step(action) + + print(f"Reward: {reward}") + print(f"Terminated: {terminated}") + print(f"Truncated: {truncated}") + print(f"Success: {info.get('metrics', {}).get('success', False)}") + print(f"Current step: {env.current_step}") + print(f"Boxes on target: {env.boxes_on_target}/{env.num_boxes}") + + # Show last user message + if obs: + print(f"\nLatest observation:\n{obs[-1]['content']}") + + if terminated or truncated: + print(f"\nEpisode ended! Reason: {info.get('stop_reason', 'Unknown')}") + break + + # Test with invalid action + print("\n=== Testing Invalid Action ===") + obs, reward, terminated, truncated, info = env.step("invalid action") + print(f"Invalid action reward: {reward}") + print(f"Action valid: {info.get('metrics', {}).get('action_is_valid', False)}") + + # Show final environment info + print("\n=== Final Environment Info ===") + env_info = env.env_info + for key, value in env_info.items(): + print(f"{key}: {value}") + + env.close() + print("\n=== Debug Complete ===") diff --git a/roll/pipeline/agentic/env_manager/agent_native_env_manager.py b/roll/pipeline/agentic/env_manager/agent_native_env_manager.py new file mode 100644 index 000000000..6d11285a6 --- /dev/null +++ b/roll/pipeline/agentic/env_manager/agent_native_env_manager.py @@ -0,0 +1,521 @@ +import copy +import json +import time +from datetime import datetime +from typing import List, Union, Dict, Optional + +import numpy as np +import ray +import torch +from codetiming import Timer +from tensordict import TensorDict + +from roll.pipeline.agentic.agentic_config import AgenticConfig, EnvManagerConfig +from roll.pipeline.agentic.env_manager.base_env_manager import RolloutCache +from roll.distributed.scheduler.protocol import DataProto +from roll.pipeline.agentic.env_manager.token_mask_utils import convert_list_content_str +from roll.pipeline.agentic.env_manager.traj_env_manager import TrajEnvManager +from roll.utils.constants import GenerateStopReason, EpisodeStopReason +from roll.utils.functionals import pad_to_length, aggregate_metrics +from roll.utils.hash_utils import compute_object_hash + + +class AgentNativeStepEnvManager(TrajEnvManager): + """ + Used for native like format. + You can extend your format_messages as needed. + For swe/tb native env + # TODO: 增加业务指标,性能/error/timeout + """ + log_stats: Dict + failure_mode: str + env_reset_failed: bool + stop_reason: EpisodeStopReason + tools: List[Dict] + traj_start_time: float + + def run_rollout_loop(self, data: DataProto): + assert "seed" in data.meta_info + self.running = True + self.group_seed = data.meta_info['seed'] + self.env_config['group_seed'] + with Timer(name="reset", logger=None) as reset_timer: + rollout_cache: RolloutCache = self.reset() + self.log_stats["reset_time"] = round(reset_timer.last, 4) + start_step = self.current_step + max_reset_retries = 0 + while self.running and rollout_cache is not None: + + if self.env_reset_failed: + max_reset_retries += 1 + self.logger.error(f"[ROLLOUT_LOOP] Failed! 
- due to sandbox initialization failure...") + rollout: DataProto = self.create_placeholder_rollout(self.episode_id) + rollout.meta_info["drop_flag"] = True + + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, rollout, self.env_config['env_id'])) + self.env.close() + if max_reset_retries > 3: + backoff_time = min(3600, 10 * max_reset_retries) + self.logger.warning(f"[ROLLOUT_LOOP] Avoidance mode - Backing off for {backoff_time}s (retry #{max_reset_retries})") + time.sleep(backoff_time) + else: + time.sleep(10) + with Timer(name="reset", logger=None) as reset_timer: + rollout_cache = self.reset() + self.log_stats["reset_time"] = round(reset_timer.last, 4) + start_step = self.current_step + continue + + max_reset_retries = 0 + with Timer(name="generate", logger=None) as generate_timer: + lm_output: DataProto = self.make_decision(rollout_cache) + stop_reason = lm_output.meta_info.pop("stop_reason") + if stop_reason == GenerateStopReason.MAX_LENGTH: + self.stop_reason = EpisodeStopReason.MAX_LENGTH + elif stop_reason == GenerateStopReason.ABORT: + self.stop_reason = EpisodeStopReason.ABORT + self.log_stats["current_step"].append(self.current_step) + self.log_stats["generate_time"].append(round(generate_timer.last)) + + with Timer(name="step", logger=None) as step_timer: + if stop_reason in [GenerateStopReason.FINISH, GenerateStopReason.MAX_LENGTH]: + rollout_cache: RolloutCache = self.step(lm_output) + self.log_stats["step_time"].append(round(step_timer.last, 4)) + + if self.running and rollout_cache.terminated: + rollout: DataProto = self.formulate_rollouts(rollout_cache) + traj_group_id = f"{self.rollout_cache.tag}_{self.rollout_cache.group_id}_{self.episode_id}_{self.group_seed}" + traj_id = f"{traj_group_id}_{self.rollout_cache.env_id}" + rollout.non_tensor_batch["traj_group_id"] = np.array([traj_group_id] * rollout.batch.batch_size[0], dtype=object) + rollout.non_tensor_batch["traj_id"] = np.array([traj_id] * rollout.batch.batch_size[0], dtype=object) + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, rollout, self.env_config['env_id'])) + + rollout_cache = self.reset() + start_step = self.current_step + + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, None, self.env_config['env_id'])) + + def reset(self) -> Optional[RolloutCache]: + self.log_stats = {"generate_time": [], "step_time": [], "current_step": [], "reset_time": 0.0, "response_length": [], "tokens_per_second": []} + self.stop_reason = EpisodeStopReason.FINISH + self.rollout_cache = RolloutCache(env_id=self.env_config['env_id'], + group_id=self.env_config['group_id'], + tag=self.env_config['tag']) + + self.episode_id = ray.get(self.output_queue.get_episode_id.remote( + self.env_config['group_id'], + self.env_config['env_id'] + )) + if self.episode_id is None: + assert not self.running + return None + + seed = self.group_seed + self.episode_id + self.traj_start_time = time.time() + observation, info = self.env.reset(seed=seed) + if observation is None: + return None + + if self.env.env_reset_failed: + self.env_reset_failed = True + self.logger.error(f"[ENV_RESET] Failed! 
- Environment reset failed, observation: {json.dumps(observation, ensure_ascii=False)}, env_reset_failed: {self.env.env_reset_failed}") + self.failure_mode = info.get("failure_mode", "Sandbox Initialization Failed") + self.stop_reason = EpisodeStopReason.ENV_RESET_FAILED + else: + self.env_reset_failed = False + + self.tools = info.get("tools", []) + self.rollout_cache.history.append({ + "observation": copy.deepcopy(observation), + "messages": None, # agent input messages + **info, + }) + return self.rollout_cache + + def step(self, llm_output: DataProto): + if llm_output.batch is not None: + response = self.tokenizer.batch_decode(llm_output.batch['responses'], skip_special_tokens=False)[0] + else: + response = self.stop_reason + observation, reward, terminated, truncated, info = self.env.step(action=response) + + self.rollout_cache.step += 1 + + # terminated 完全由swe|tb env决定 + self.rollout_cache.terminated = terminated + self.rollout_cache.truncated = truncated + if self.rollout_cache.step >= self.env_config.max_steps: + self.stop_reason = EpisodeStopReason.MAX_STEPS + self.rollout_cache.history[-1]['reward'] = reward + self.rollout_cache.history[-1]['llm_response'] = response + if info is not None: + self.rollout_cache.history[-1].update(info) + + self.rollout_cache.history.append({ + "observation": copy.deepcopy(observation), + "actions_left": self.env_config.max_steps - self.rollout_cache.step, + "messages": None + }) + return self.rollout_cache + + def make_decision(self, rollout_cache: RolloutCache): + lm_input = self.format_messages(rollout_cache) + input_ids = lm_input.batch["input_ids"] + + if input_ids.shape[1] >= self.pipeline_config.sequence_length: + self.logger.warning(f"sequence_length = {self.pipeline_config.sequence_length} input_ids length = {input_ids.shape[1]}," + f"maybe you should increase the response_length") + return DataProto(meta_info={"stop_reason": GenerateStopReason.MAX_LENGTH}) + + max_new_tokens = min(self.env_config["max_tokens_per_step"], + self.worker_config.generating_args.max_new_tokens, + self.pipeline_config.sequence_length-input_ids.shape[1]) + generation_config = self.worker_config.generating_args.to_dict() + generation_config["max_new_tokens"] = min(max_new_tokens, self.pipeline_config.sequence_length) + lm_input.meta_info["src_rank"] = self.env_config["env_id"] + + content = self.rollout_cache.history[-1] + input_messages = content['observation'] + + lm_output: DataProto = self.llm_proxy.generate(messages=input_messages, + lm_input=lm_input, + generation_config=generation_config) + + if lm_output is None: + return DataProto(meta_info={"stop_reason": GenerateStopReason.ABORT}) + + response_ids = lm_output.batch['responses'][0] + response_ids = response_ids.tolist() + + if "infer_logprobs" in lm_output.batch.keys(): + infer_logprobs = lm_output.batch['infer_logprobs'][0][-len(response_ids):] + content["infer_logprobs"] = infer_logprobs.tolist() + + content["response_ids"] = response_ids + content["messages"].append({"role": "assistant", "content": self.tokenizer.decode(response_ids, skip_special_tokens=True)}) + lm_output.meta_info["stop_reason"] = GenerateStopReason.FINISH + return lm_output + + def format_messages(self, rollout_cache: RolloutCache) -> DataProto: + current_cache = rollout_cache.history[-1] + + messages: List[Dict] = current_cache["observation"] + + prompt_ids = self.tokenizer.apply_chat_template(convert_list_content_str(messages, parse_tool_call_parameter_to_dict=self.pipeline_config.parse_tool_call_parameter_to_dict), + 
tools=self.tools, + tokenize=True, add_generation_prompt=True, enable_thinking=False) + input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0) + attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) + # Huggingface Transformers prefer position_ids to be 0-based. + # Attn Mask: [1, 1, 1, ..., 1, 0, 0, ..., 0] + # cumsum: [1, 2, 3, ..., n, n+1, n+1, ..., n+1] + # cumsum - 1: [0, 1, 2, ..., n-1, n, n, ..., n] + position_ids = attention_mask.cumsum(dim=-1) - 1 + lm_input = DataProto() + lm_input.batch = TensorDict({ + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + }, batch_size=input_ids.shape[0]) + + current_cache["prompt_ids"] = prompt_ids + current_cache['state_hash'] = compute_object_hash(messages) + current_cache['messages'] = messages + return lm_input + + def formulate_rollouts(self, rollout_cache: RolloutCache): + """ + Construct step-wise training samples from the collected trajectory. + TODO: 相同前序合并优化 + 样本构造方法: + - 按messages构造response_id + - 按response_id构造,纯step_wise用 + """ + last_observation = [] + if 'observation' in rollout_cache.history[-1]: + last_observation = rollout_cache.history[-1]['observation'] + rollout_cache.history.pop(-1) + + samples: List[DataProto] = [] + step_rewards = [i['reward'] for i in self.rollout_cache.history] + episode_score = sum(step_rewards) + + # Initialize lists for step length statistics + step_prompt_length_list = [] + step_response_length_list = [] + + all_messages: List[List[Dict]] = [] # 可能包含多条轨迹,相同前序的为一条messages + messages = None + for step, history in enumerate(rollout_cache.history): + if "response_ids" not in history: + break + + # Collect step length statistics + step_prompt_length_list.append(len(history["prompt_ids"])) + step_response_length_list.append(len(history["response_ids"])) + + token_ids = history["prompt_ids"] + history["response_ids"] + response_masks = [0] * len(history["prompt_ids"]) + [1] * len(history["response_ids"]) + input_ids =torch.tensor(token_ids, dtype=torch.long).unsqueeze(0) + attention_mask = torch.tensor([1] * len(token_ids), dtype=torch.long).unsqueeze(0) + response_mask = torch.tensor(response_masks, dtype=torch.bool).unsqueeze(0) + infer_logprobs = [] + if "infer_logprobs" in history: + infer_logprobs = [0] * len(history["prompt_ids"]) + history["infer_logprobs"] + + generate_time = self.log_stats["generate_time"][len(self.log_stats["response_length"])] + self.log_stats["response_length"].append(len(history["response_ids"])) + if generate_time > 0.01: + tokens_per_second = len(history["response_ids"]) / generate_time + self.log_stats["tokens_per_second"].append(tokens_per_second) + else: + self.log_stats["tokens_per_second"].append(0.0) + + first_response_idx = response_masks.index(1) + prompt_masks = [1] * first_response_idx + [0] * (len(token_ids) - first_response_idx) + prompt_mask = torch.tensor(prompt_masks, dtype=torch.bool).unsqueeze(0) + score_tensor = torch.tensor([0] * len(token_ids), dtype=torch.float).unsqueeze(0) + score_tensor[0][-1] = history['reward'] + # Huggingface Transformers prefer position_ids to be 0-based. 
+ # Attn Mask: [1, 1, 1, ..., 1, 0, 0, ..., 0] + # cumsum: [1, 2, 3, ..., n, n+1, n+1, ..., n+1] + # cumsum - 1: [0, 1, 2, ..., n-1, n, n, ..., n] + position_ids = attention_mask.cumsum(dim=-1) - 1 + + input_ids = pad_to_length(input_ids, length=self.pipeline_config.sequence_length, pad_value=self.tokenizer.pad_token_id) + attention_mask = pad_to_length(attention_mask, length=self.pipeline_config.sequence_length, pad_value=0) + position_ids = pad_to_length(position_ids, length=self.pipeline_config.sequence_length, pad_value=0) + response_mask = pad_to_length(response_mask, length=self.pipeline_config.sequence_length, pad_value=0) + prompt_mask = pad_to_length(prompt_mask, length=self.pipeline_config.sequence_length, pad_value=0) + score_tensor = pad_to_length(score_tensor, length=self.pipeline_config.sequence_length, pad_value=0) + lm_input = DataProto( + batch=TensorDict( + { + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "response_mask": response_mask, + "prompt_mask": prompt_mask, + "scores": score_tensor, + }, + batch_size=input_ids.shape[0]), + non_tensor_batch={ + "env_ids": np.array([self.rollout_cache.env_id], dtype=object), + "group_ids": np.array([self.rollout_cache.group_id], dtype=object), + "tags": np.array([self.rollout_cache.tag], dtype=object), + "step_scores": np.array([history["reward"]], dtype=object), # step-level reward, return by env + "episode_scores": np.array([episode_score], dtype=object), + "state_hash": np.array([history['state_hash']], dtype=object), + "step": np.array([step], dtype=object), + "trajectory_data": np.array([None], dtype=object), + "messages": np.array([None], dtype=object), + "tools": np.array([None], dtype=object), + "exp_name": np.array([self.pipeline_config.exp_name], dtype=object), + } + ) + if len(infer_logprobs): + infer_logprobs = torch.tensor(infer_logprobs, dtype=torch.float).unsqueeze(0) + infer_logprobs = pad_to_length(infer_logprobs, length=self.pipeline_config.sequence_length, pad_value=0) + lm_input.batch["infer_logprobs"] = infer_logprobs[:, 1:] + + samples.append(lm_input) + messages = history["messages"] + + # TODO: 需要更细致的处理 + # 可选的方式是,将content + tool_use dict 替换回response + all_messages.append(messages) + batch: DataProto = DataProto.concat(samples) + + response_length = batch.batch["response_mask"].float().sum(-1).mean().item() + metrics_agg_mode = self.rollout_cache.history[-1].get('metrics_agg_mode', {}) + history_metrics = [item.get("metrics", {}) for item in self.rollout_cache.history] + env_metric = aggregate_metrics(history_metrics=history_metrics, metrics_agg_mode=metrics_agg_mode) + env_metric["num_actions"] = rollout_cache.step + env_metric["env_timeout"] = getattr(self.env, "env_timeout", False) + timing_metric = { + "traj_time_env_total": round(float(time.time() - self.traj_start_time), 4), + "traj_time_reset": round(float(self.log_stats["reset_time"]), 4), + "traj_time_step": round(float(np.mean(self.log_stats["step_time"])), 4), + "traj_time_step_min": round(float(np.min(self.log_stats["step_time"])), 4), + "traj_time_step_max": round(float(np.max(self.log_stats["step_time"])), 4), + "traj_time_generate": round(float(np.mean(self.log_stats["generate_time"])), 4), + "traj_time_generate_min": round(float(np.min(self.log_stats["generate_time"])), 4), + "traj_time_generate_max": round(float(np.max(self.log_stats["generate_time"])), 4), + "traj_time_generate_sum": round(float(np.sum(self.log_stats["generate_time"])), 4), + "traj_time_response_length": 
round(float(np.mean(self.log_stats["response_length"])), 4), + "traj_time_response_length_min": round(float(np.min(self.log_stats["response_length"])), 4), + "traj_time_response_length_max": round(float(np.max(self.log_stats["response_length"])), 4), + "traj_time_tokens_per_second": round(float(np.mean(self.log_stats["tokens_per_second"])), 4), + "traj_time_tokens_per_second_min": round(float(np.min(self.log_stats["tokens_per_second"])), 4), + "traj_time_tokens_per_second_max": round(float(np.max(self.log_stats["tokens_per_second"])), 4), + } + length_metric = { + "response_length": float(response_length), + "step_prompt_length": round(float(np.mean(step_prompt_length_list)), 2), + "step_prompt_length_min": round(float(np.min(step_prompt_length_list)), 2), + "step_prompt_length_max": round(float(np.max(step_prompt_length_list)), 2), + "step_response_length": round(float(np.mean(step_response_length_list)), 2), + "step_response_length_min": round(float(np.min(step_response_length_list)), 2), + "step_response_length_max": round(float(np.max(step_response_length_list)), 2), + } + + env_metric.update(timing_metric) + env_metric.update(length_metric) + + env_metric = {f"env/{rollout_cache.tag}/{k}": v for k, v in env_metric.items()} + env_metric["env/response_length"] = response_length + batch.meta_info = {"metrics": env_metric} + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f") + start_step = self.log_stats["current_step"][0] + end_step = self.log_stats["current_step"][-1] + last_step_info = rollout_cache.history[-1] + failure_mode = last_step_info.get("failure_mode", "") + traj_id = f"{rollout_cache.tag}_{start_step}_{end_step}_{rollout_cache.group_id}_{rollout_cache.env_id}_{self.episode_id}_{self.group_seed}_{timestamp}" + trajectory_data = { + "trajectory_id": traj_id, + "timestamp": timestamp, + "current_step": self.current_step, + "env_info":{ + "env_id": rollout_cache.env_id, + "group_id": rollout_cache.group_id, + "tag": rollout_cache.tag, + "seed": self.group_seed, + "episode_id": self.episode_id, + "max_steps": self.env_config.max_steps, + "mode": self.mode, + "sequence_length": self.pipeline_config.sequence_length, + **self.env.env_info + }, + "timing_info": { + "traj_save_time": datetime.now().isoformat(), + **timing_metric + }, + "length_info": { + "trajectory_length": rollout_cache.step, + "num_actions": rollout_cache.step, + "terminated": rollout_cache.terminated, + "truncated": rollout_cache.truncated, + **length_metric + }, + "reward_info": { + "episode_reward": episode_score, + "step_rewards": step_rewards, + "first_round_reward": step_rewards[0] if step_rewards else 0, + "final_reward": step_rewards[-1] if step_rewards else 0 + }, + "failure_info": { + "failure_mode": last_step_info.get("failure_mode", ""), + "stop_reason": self.stop_reason.name, + "error_messages": last_step_info.get("error_messages", []), + "test_output": last_step_info.get("test_output", ""), + "has_failure": bool(failure_mode and failure_mode not in ['', 'none']), + "failure_step": rollout_cache.step, + }, + "metrics": env_metric, + "last_observation": last_observation + } + + # stepwise 样本只存一份traj data + batch.non_tensor_batch["trajectory_data"][-1] = json.dumps(trajectory_data) + batch.non_tensor_batch["messages"][-1] = json.dumps(all_messages) + batch.non_tensor_batch["tools"][-1] = json.dumps(self.tools) + + # 避免 trajectory_data dict 过大,导致写入/读取odps失败 + colummns_config = [ + ["trajectory_data", "string"], + ["messages", "string"], + ["tools", "string"], + ["exp_name", "string"], + ] + 
batch.meta_info["COLUMMNS_CONFIG"] = colummns_config + return batch + + def create_placeholder_rollout(self, episode_id): + """ + Create a minimal placeholder rollout with response_mask=1 to skip loss calculation. + """ + self.logger.info(f"[PLACEHOLDER_ROLLOUT] failure_mode: {self.failure_mode}") + + seq_len = length=self.pipeline_config.sequence_length + input_ids = torch.full((1, seq_len), self.tokenizer.pad_token_id, dtype=torch.long) + attention_mask = torch.zeros((1, seq_len), dtype=torch.long) + position_ids = torch.zeros((1, seq_len), dtype=torch.long) + response_mask = torch.zeros((1, seq_len), dtype=torch.bool) + prompt_mask = torch.zeros((1, seq_len), dtype=torch.bool) + score_tensor = torch.zeros((1, seq_len), dtype=torch.float) + + lm_input = DataProto() + lm_input.batch = TensorDict({ + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + "response_mask": response_mask, + "prompt_mask": prompt_mask, + "scores": score_tensor, + }, batch_size=1) + + + infer_logprobs = torch.zeros((1, seq_len - 1), dtype=torch.float) + lm_input.batch["infer_logprobs"] = infer_logprobs + + lm_input.non_tensor_batch = { + "env_ids": np.array([self.env_config['env_id']], dtype=object), + "group_ids": np.array([self.env_config['group_id']], dtype=object), + "tags": np.array([self.env_config['tag']], dtype=object), + "step_scores": np.array([0], dtype=object), + "episode_scores": np.array([0], dtype=object), + "state_hash": np.array([''], dtype=object), + "step": np.array([0], dtype=object), + "trajectory_data": np.array([None], dtype=object), + "messages": np.array([None], dtype=object), + "tools": np.array([None], dtype=object), + "exp_name": np.array([self.pipeline_config.exp_name], dtype=object), + } + + traj_group_id = f"{self.env_config['tag']}_{self.env_config['group_id']}_{episode_id}_{self.group_seed}" + traj_id = f"{traj_group_id}_{self.env_config['env_id']}" + lm_input.non_tensor_batch["traj_group_id"] = np.array([traj_group_id] * lm_input.batch.batch_size[0], dtype=object) + lm_input.non_tensor_batch["traj_id"] = np.array([traj_id] * lm_input.batch.batch_size[0], dtype=object) + + colummns_config = [ + ["trajectory_data", "string"], + ["messages", "string"], + ["tools", "string"], + ["exp_name", "string"], + ] + lm_input.meta_info["COLUMMNS_CONFIG"] = colummns_config + lm_input.meta_info["metrics"] = {} + return lm_input + + + +class GroupFilter: + def __init__(self, config: AgenticConfig, env_manager_config: EnvManagerConfig, mode: str): + self.config = config + self.env_manager_config = env_manager_config + self.mode = mode + self.global_filter_stats = {"total": 0, "filtered": 0} + + def filter(self, group_id: int, episode_id: int, group: list[DataProto]): + self.global_filter_stats["total"] += 1 + should_drop = False + for data in group: + if data.meta_info.get("drop_flag", False): + should_drop = True + + if not should_drop: + return False + + current_global_filter_ratio = ( + self.global_filter_stats["filtered"] / self.global_filter_stats["total"] + if self.global_filter_stats["total"] > 0 else 0.0 + ) + + if current_global_filter_ratio >= 0.5: + return False + + if (self.global_filter_stats["filtered"] + 1) / self.global_filter_stats["total"] > 0.5: + return False + + self.global_filter_stats["filtered"] += 1 + return True \ No newline at end of file diff --git a/roll/pipeline/agentic/env_manager/step_concat_env_manager.py b/roll/pipeline/agentic/env_manager/step_concat_env_manager.py index 9a32e7645..23438f935 100644 --- 
a/roll/pipeline/agentic/env_manager/step_concat_env_manager.py +++ b/roll/pipeline/agentic/env_manager/step_concat_env_manager.py @@ -36,7 +36,8 @@ def format_messages(self, rollout_cache: RolloutCache) -> DataProto: prompt_ids = custom_apply_chat_template(messages=messages, tokenizer=self.tokenizer, add_generation_prompt=True) input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0) attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) - position_ids = attention_mask.cumsum(dim=-1) + # Huggingface Transformers prefer position_ids to be 0-based. + position_ids = attention_mask.cumsum(dim=-1) - 1 lm_input = DataProto() lm_input.batch = TensorDict({ "input_ids": input_ids, diff --git a/roll/pipeline/agentic/env_manager/step_env_manager.py b/roll/pipeline/agentic/env_manager/step_env_manager.py index 4348605a3..987ff97f1 100644 --- a/roll/pipeline/agentic/env_manager/step_env_manager.py +++ b/roll/pipeline/agentic/env_manager/step_env_manager.py @@ -52,7 +52,7 @@ def format_messages(self, rollout_cache: RolloutCache) -> DataProto: prompt_ids = custom_apply_chat_template(messages=messages, tokenizer=self.tokenizer, add_generation_prompt=True) input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0) attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) - position_ids = attention_mask.cumsum(dim=-1) + position_ids = attention_mask.cumsum(dim=-1) - 1 lm_input = DataProto() lm_input.batch = TensorDict({ "input_ids": input_ids, @@ -88,7 +88,11 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): prompt_mask = torch.tensor(prompt_masks, dtype=torch.bool).unsqueeze(0) score_tensor = torch.tensor([0] * len(token_ids), dtype=torch.float).unsqueeze(0) score_tensor[0][-1] = history['reward'] - position_ids = attention_mask.cumsum(dim=-1) + # Huggingface Transformers prefer position_ids to be 0-based. + # Attn Mask: [1, 1, 1, ..., 1, 0, 0, ..., 0] + # cumsum: [1, 2, 3, ..., n, n+1, n+1, ..., n+1] + # cumsum - 1: [0, 1, 2, ..., n-1, n, n, ..., n] + position_ids = attention_mask.cumsum(dim=-1) - 1 input_ids = pad_to_length(input_ids, length=self.pipeline_config.sequence_length, pad_value=self.tokenizer.pad_token_id) attention_mask = pad_to_length(attention_mask, length=self.pipeline_config.sequence_length, pad_value=0) diff --git a/roll/pipeline/agentic/env_manager/token_mask_utils.py b/roll/pipeline/agentic/env_manager/token_mask_utils.py index c80de49b3..833081942 100644 --- a/roll/pipeline/agentic/env_manager/token_mask_utils.py +++ b/roll/pipeline/agentic/env_manager/token_mask_utils.py @@ -1,8 +1,11 @@ +import json from typing import List, Dict from functools import lru_cache from transformers import PreTrainedTokenizer from roll.datasets.collator import DataCollatorWithPaddingForMM +from roll.utils.logging import get_logger +logger = get_logger() @lru_cache(maxsize=10) @@ -183,7 +186,7 @@ def token_ids_to_assistant_mask(messages: List[Dict], input_ids_list: List[List] return assistant_mask_list -def split_by_token(input_ids: list, token: int) -> list[list]: +def split_by_token(input_ids: list, token: int, messages: List[Dict], tokenizer: PreTrainedTokenizer) -> list[list]: """ Split the input_ids list by the given token and return a list of lists. Each sub-list starts with that token. 
@@ -220,4 +223,96 @@ def split_by_token(input_ids: list, token: int) -> list[list]: if current_segment: result.append(current_segment) + if len(result) == len(messages): + return result + input_ids_list = result[:] + result = [] + # splitting by the start token is vulnerable since the format of responses cannot be guaranteed + # input_ids_list has a larger length than messages on format errors, which is caused by + # responses including more than one start token + # adjust the segments according to messages + segment_mismatch = True + ids_next_idx = 0 # index in input_ids_list for the next message + bos_token_id = input_ids_list[0][0] + for i, message in enumerate(messages): + segment_mismatch = len(input_ids_list) - ids_next_idx != len(messages) - i + if segment_mismatch: + # str or list of dict + content = ( + "".join([item["text"] for item in message["content"] if item["type"] == "text"]) + if not isinstance(message["content"], str) + else message["content"] + ) + token_id_without_format = tokenizer.encode(content) + bos_num = token_id_without_format.count(bos_token_id) + 1 # generated + chat_format + current_segment = sum(input_ids_list[ids_next_idx : ids_next_idx + bos_num], []) + ids_next_idx += bos_num + else: + current_segment = input_ids_list[ids_next_idx] + ids_next_idx += 1 + result.append(current_segment) return result + + + + +def convert_list_content_str(messages: List[Dict], parse_tool_call_parameter_to_dict=False) -> List[Dict]: + """ + Convert state0.json format to tokenizer-compatible format. + + The state0.json may have content as either: + 1. A string (already compatible) + 2. A list of dictionaries with 'type' and 'text' keys + + This function ensures all content is converted to strings by concatenating + text from list objects when needed. + + Args: + messages: List of message dictionaries from iflow_state0.json + parse_tool_call_parameter_to_dict: Whether to convert tool call arguments to dict, https://github.com/QwenLM/Qwen3-Coder/issues/444 + + Returns: + List of message dictionaries with string content suitable for tokenizer + """ + converted_messages = [] + + for message in messages: + converted_message = message.copy() + + # Handle content field + content = message.get('content') + if isinstance(content, list): + # Concatenate all text elements from the list + text_parts = [] + for item in content: + if isinstance(item, dict) and 'text' in item: + text_parts.append(item['text']) + elif isinstance(item, str): + text_parts.append(item) + converted_message['content'] = ''.join(text_parts) + elif isinstance(content, str): + # Already in correct format + converted_message['content'] = content + else: + # Handle other cases (convert to string) + converted_message['content'] = str(content) + + if parse_tool_call_parameter_to_dict: + if message['role'] == 'assistant': + if "tool_calls" in message: + tool_calls: List[Dict] = message['tool_calls'] + try: + for tool_call in tool_calls: + if "arguments" in tool_call["function"] and isinstance(tool_call['function']['arguments'], str): + tool_call['function']["arguments"] = json.loads(tool_call['function']['arguments']) + except Exception as e: + # NOTE: double-check whether this fallback logic is reasonable + # the current preference is to replace the assistant content with content=response_text + # the case where arguments is not a str can appear during model_update, since aborted requests are converted twice + content = converted_message.get('content', '') + tool_calls = message.pop("tool_calls") + converted_message['content'] = f"{content}{tool_calls}" + logger.error(f"Error parsing tool call arguments: {e}, src arguments: 
{json.dumps(tool_calls)}, parsing drawback to {converted_message['content']}") + converted_messages.append(converted_message) + + return converted_messages \ No newline at end of file diff --git a/roll/pipeline/agentic/env_manager/traj_env_manager.py b/roll/pipeline/agentic/env_manager/traj_env_manager.py index 88ab15d91..2a1f23ee3 100644 --- a/roll/pipeline/agentic/env_manager/traj_env_manager.py +++ b/roll/pipeline/agentic/env_manager/traj_env_manager.py @@ -125,25 +125,27 @@ def run_rollout_loop(self, data: DataProto): if self.running and (rollout_cache.terminated or stop_reason == GenerateStopReason.MAX_LENGTH): self.logger.debug(f"group_id: {self.env_config['group_id']} env_id: {self.env_config['env_id']} episode_id: {self.episode_id} start_step {start_step} gen_stats: {log_stats}") log_stats = {"generate_time": [], "step_time": [], "current_step": []} - rollout: DataProto = self.formulate_rollouts(rollout_cache) traj_group_id = f"{self.rollout_cache.tag}_{self.rollout_cache.group_id}_{self.episode_id}_{self.group_seed}" traj_id = f"{traj_group_id}_{self.rollout_cache.env_id}" rollout.non_tensor_batch["traj_group_id"] = np.array([traj_group_id] * rollout.batch.batch_size[0], dtype=object) rollout.non_tensor_batch["traj_id"] = np.array([traj_id] * rollout.batch.batch_size[0], dtype=object) - ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, rollout)) + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, rollout, self.env_config['env_id'])) rollout_cache = self.reset() start_step = self.current_step - ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, None)) + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, None, self.env_config['env_id'])) def reset(self) -> RolloutCache: self.rollout_cache = RolloutCache(env_id=self.env_config['env_id'], group_id=self.env_config['group_id'], tag=self.env_config['tag']) - self.episode_id = ray.get(self.output_queue.get_episode_id.remote(self.env_config['group_id'])) + self.episode_id = ray.get(self.output_queue.get_episode_id.remote( + self.env_config['group_id'], + self.env_config['env_id'] + )) if self.episode_id is None: assert not self.running return None @@ -190,11 +192,6 @@ def step(self, llm_output: DataProto): if suffix is not None: self.rollout_cache.history[-1]["suffix"] = suffix - if self.mode == "val" and self.pipeline_config.render_save_dir and hasattr(self.env, "render"): - frame = self.env.render(mode='rgb_array') - if isinstance(frame, np.ndarray): - self.rollout_cache.frames.append(frame) - return self.rollout_cache def make_decision(self, rollout_cache: RolloutCache): @@ -226,7 +223,7 @@ def make_decision(self, rollout_cache: RolloutCache): response_ids = response_ids.tolist() content = self.rollout_cache.history[-1] - if "infer_logprobs" in lm_output.batch: + if "infer_logprobs" in lm_output.batch.keys(): infer_logprobs = lm_output.batch['infer_logprobs'][0][-len(response_ids):] content["infer_logprobs"] = infer_logprobs.tolist() @@ -240,7 +237,7 @@ def format_messages(self, history: RolloutCache) -> DataProto: messages = [] user_content = "" - if self.rollout_cache.step == 0: + if content["actions_left"] == self.env_config.max_steps: messages.append({"role": "system", "content": self.agent_system_template}) if "env_instruction" in history.history[0]: user_content = f"{history.history[0]['env_instruction']}\n" @@ -270,7 +267,11 @@ def format_messages(self, 
history: RolloutCache) -> DataProto: input_ids = torch.tensor(input_ids, dtype=torch.long).unsqueeze(0) attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) - position_ids = attention_mask.cumsum(dim=-1) + # Huggingface Transformers prefer position_ids to be 0-based. + # Attn Mask: [1, 1, 1, ..., 1, 0, 0, ..., 0] + # cumsum: [1, 2, 3, ..., n, n+1, n+1, ..., n+1] + # cumsum - 1: [0, 1, 2, ..., n-1, n, n, ..., n] + position_ids = attention_mask.cumsum(dim=-1) - 1 lm_input = DataProto() lm_input.batch = TensorDict({ "input_ids": input_ids, @@ -316,7 +317,11 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): prompt_mask =torch.tensor(prompt_masks, dtype=torch.bool).unsqueeze(0) score_tensor = torch.tensor([0] * len(token_ids), dtype=torch.float).unsqueeze(0) score_tensor[0][-1] = episode_score - position_ids = attention_mask.cumsum(dim=-1) + # Huggingface Transformers prefer position_ids to be 0-based. + # Attn Mask: [1, 1, 1, ..., 1, 0, 0, ..., 0] + # cumsum: [1, 2, 3, ..., n, n+1, n+1, ..., n+1] + # cumsum - 1: [0, 1, 2, ..., n-1, n, n, ..., n] + position_ids = attention_mask.cumsum(dim=-1) - 1 lm_input = DataProto() lm_input.batch = TensorDict( @@ -354,7 +359,6 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): "env_ids": np.array([self.rollout_cache.env_id], dtype=object), "group_ids": np.array([self.rollout_cache.group_id], dtype=object), "tags": np.array([self.rollout_cache.tag], dtype=object), - "frames": np.array([self.rollout_cache.frames], dtype=object), "step_scores": np.array([scores], dtype=object), "episode_scores": np.array([episode_score], dtype=object), }) diff --git a/roll/pipeline/agentic/env_manager/vl_traj_env_manager.py b/roll/pipeline/agentic/env_manager/vl_traj_env_manager.py index 2a9a17fce..f109ca752 100644 --- a/roll/pipeline/agentic/env_manager/vl_traj_env_manager.py +++ b/roll/pipeline/agentic/env_manager/vl_traj_env_manager.py @@ -1,25 +1,28 @@ -import base64 from contextlib import nullcontext +from collections import defaultdict from threading import Lock from typing import Dict, List, Optional, Tuple import PIL import gem import numpy as np +import ray import torch from transformers import PreTrainedTokenizer, ProcessorMixin +from codetiming import Timer from roll.datasets.collator import DataCollatorWithPaddingForMM from roll.distributed.scheduler.generate_scheduler import RequestScheduler from roll.distributed.scheduler.protocol import DataProto from roll.distributed.scheduler.rollout_scheduler import GroupQueueManager +from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.agentic.agentic_config import EnvManagerConfig, AgenticConfig from roll.pipeline.agentic.env_manager.base_env_manager import RolloutCache, BaseEnvManager from roll.pipeline.agentic.env_manager.token_mask_utils import split_by_token, \ token_ids_to_assistant_mask from roll.pipeline.agentic.env_manager.traj_env_manager import TrajEnvManager from roll.pipeline.agentic.llm_proxy import BaseLLMProxy, create_llm_proxy -from roll.utils.constants import GenerateStopReason +from roll.utils.constants import EpisodeStopReason, GenerateStopReason, RAY_NAMESPACE from roll.utils.env_action_limiter import get_global_limiter from roll.utils.functionals import pad_to_length, aggregate_metrics from roll.utils.logging import get_logger @@ -48,12 +51,15 @@ def __init__(self, self.tokenizer: PreTrainedTokenizer = tokenizer self.processor: ProcessorMixin = processor self.extra_data_provider = extra_data_provider + # TODO: 
allow to specify image_token and other processor settings self.collator = DataCollatorWithPaddingForMM( - tokenizer=self.tokenizer, - processor=self.processor, - answer_key=None, - extra_data_provider=self.extra_data_provider, - ) + tokenizer=self.tokenizer, + processor=self.processor, + answer_key=None, + image_flag_key=None, + video_flag_key=None, + extra_data_provider=self.extra_data_provider, + ) self.output_queue = output_queue self.mode = mode self.generate_scheduler: RequestScheduler = generate_scheduler @@ -61,8 +67,7 @@ def __init__(self, # EnvManager states self.rollout_cache: Optional[RolloutCache] = None self.group_seed = None - self.episode_id = 0 - self.current_step = -1 + self.episode_id = None self.running = False self.use_thread_lock = self.env_config.get("use_thread_lock", False) # 避免同时执行大量cpu操作, 可以通过env_config配置 self.thread_lock = thread_lock if self.use_thread_lock else nullcontext() @@ -71,10 +76,45 @@ def __init__(self, self.env_step_limiter = nullcontext() if self.max_env_step_concurrent > 0: env_tag = self.env_config.get("tag", "default") - self.env_step_limiter = get_global_limiter(tag=env_tag, max_concurrent_calls=self.max_env_step_concurrent) + self.env_step_limiter = get_global_limiter(tag=f"{env_tag}_{self.mode}", max_concurrent_calls=self.max_env_step_concurrent) + + # Initialize reward scheduler and reward proxy BEFORE creating the environment + # This allows passing reward components through env_config to the environment constructor + self.reward_scheduler: Optional[RequestScheduler] = None + self.reward_proxy: Optional[BaseLLMProxy] = None + self.reward_tokenizer: Optional[PreTrainedTokenizer] = None + + # Create environment kwargs from config (convert OmegaConf to dict to avoid type errors) + env_kwargs = dict(self.env_config['config']) + + # Try to get reward scheduler from Ray named actor + if self.pipeline_config.reward: + self.reward_scheduler = ray.get_actor( + name=f"RewardScheduler-{pipeline_config.reward.name}", + namespace=RAY_NAMESPACE + ) + # Get reward tokenizer + self.reward_tokenizer = default_tokenizer_provider( + model_args=pipeline_config.reward.model_args + ) + # Create reward proxy (without env reference since env doesn't exist yet) + self.reward_proxy = create_llm_proxy( + generate_scheduler=self.reward_scheduler, + llm_proxy_config=pipeline_config.reward.llm_proxy, + tokenizer=self.reward_tokenizer, + env=None, + ) + self.logger.info(f"Initialized reward proxy with scheduler: RewardScheduler-{pipeline_config.reward.name}") + + # Inject reward components into env_kwargs (not OmegaConf config) + env_kwargs['current_env_id'] = self.env_config["env_id"] + env_kwargs['reward_tokenizer'] = self.reward_tokenizer + env_kwargs['reward_proxy'] = self.reward_proxy + if self.pipeline_config.reward.generating_args: + env_kwargs['reward_generating_args'] = self.pipeline_config.reward.generating_args.to_dict() with self.thread_lock, self.env_step_limiter: - self.env = gem.make(env_id=self.env_config["env_type"], **self.env_config['config']) + self.env = gem.make(env_id=self.env_config["env_type"], **env_kwargs) cfg_template = self.pipeline_config.custom_envs[self.env_config["tag"]] self.agent_system_template = cfg_template["agent_system_template"] @@ -104,17 +144,116 @@ def __init__(self, self.logger.info(f"pre_step_template: {self.pre_step_template}") self.logger.info(f"next_step_template: {self.next_step_template}") - # TODO: add rewards_scheduler for local ray reward workers + # Create LLM proxy for policy generation self.llm_proxy: BaseLLMProxy = 
create_llm_proxy( generate_scheduler=self.generate_scheduler, llm_proxy_config=self.worker_config.llm_proxy, tokenizer=self.tokenizer, env=self.env ) + + def run_rollout_loop(self, data: DataProto): + """ + 1. Each time run_rollout_loop is called, + it will continuously play episodes until it receives a command that data collection is complete. + The seed needs to be reset to ensure consistency across all groups. + + Seed update logic: + group_seed = base_seed + group_id + episode_seed = group_seed + episode_id + + trajectory_id: f"{group_id}_{episode_id}_{episode_seed}" + """ + assert "seed" in data.meta_info + self.running = True + self.group_seed = data.meta_info['seed'] + self.env_config['group_seed'] + rollout_cache: RolloutCache = self.reset() + start_step = self.current_step + + log_stats = {"generate_time": [], "step_time": [], "current_step": []} + self.stop_reason = EpisodeStopReason.FINISH + + while self.running and rollout_cache is not None: + + with Timer(name="generate", logger=None) as generate_timer: + lm_output: DataProto = self.make_decision(rollout_cache) + generation_stop_reason = lm_output.meta_info.pop("stop_reason") + # Convert GenerateStopReason.MAX_LENGTH to EpisodeStopReason.MAX_LENGTH + # Similar to agent_native_env_manager.py:74-77 + if generation_stop_reason == GenerateStopReason.MAX_LENGTH: + self.stop_reason = EpisodeStopReason.MAX_LENGTH + elif generation_stop_reason == GenerateStopReason.ABORT: + self.stop_reason = EpisodeStopReason.ABORT + log_stats["current_step"].append(self.current_step) + log_stats["generate_time"].append(generate_timer.last) + + with Timer(name="step", logger=None) as step_timer: + if generation_stop_reason in [GenerateStopReason.FINISH, GenerateStopReason.MAX_LENGTH]: + rollout_cache: RolloutCache = self.step(lm_output) + log_stats["step_time"].append(step_timer.last) + + if self.running and (rollout_cache.terminated or generation_stop_reason == GenerateStopReason.MAX_LENGTH): + self.logger.debug(f"group_id: {self.env_config['group_id']} env_id: {self.env_config['env_id']} episode_id: {self.episode_id} start_step {start_step} gen_stats: {log_stats}") + log_stats = {"generate_time": [], "step_time": [], "current_step": []} + + rollout: DataProto = self.formulate_rollouts(rollout_cache) + traj_group_id = f"{self.rollout_cache.tag}_{self.rollout_cache.group_id}_{self.episode_id}_{self.group_seed}" + traj_id = f"{traj_group_id}_{self.rollout_cache.env_id}" + rollout.non_tensor_batch["traj_group_id"] = np.array([traj_group_id] * rollout.batch.batch_size[0], dtype=object) + rollout.non_tensor_batch["traj_id"] = np.array([traj_id] * rollout.batch.batch_size[0], dtype=object) + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, rollout, self.env_config['env_id'])) + + rollout_cache = self.reset() + start_step = self.current_step + self.stop_reason = EpisodeStopReason.FINISH + + ray.get(self.output_queue.put.remote(self.env_config['group_id'], self.episode_id, start_step, None, self.env_config['env_id'])) + + def step(self, llm_output: DataProto): + # Similar to agent_native_env_manager.py:133-157 + # If stop_reason is MAX_LENGTH, pass it to env; otherwise decode response + if llm_output.batch is not None: + response = self.tokenizer.batch_decode(llm_output.batch['responses'], skip_special_tokens=False)[0] + else: + # When MAX_LENGTH, batch may be None, pass stop_reason as action + response = self.stop_reason if self.stop_reason else "" + + with self.thread_lock, self.env_step_limiter: + 
observation, reward, terminated, truncated, info = self.env.step(action=response) + suffix = info.pop("suffix", None) + + self.rollout_cache.step += 1 + self.rollout_cache.terminated = terminated + self.rollout_cache.truncated = truncated + if self.rollout_cache.step >= self.env_config.max_steps: + self.rollout_cache.terminated = True + if not terminated: + self.rollout_cache.truncated = True + self.rollout_cache.history[-1]['reward'] = reward + self.rollout_cache.history[-1]['llm_response'] = response + if info is not None: + self.rollout_cache.history[-1].update(info) + + self.rollout_cache.history.append({ + "observation": observation, + "actions_left": self.env_config.max_steps - self.rollout_cache.step, + "messages": None + }) + if suffix is not None: + self.rollout_cache.history[-1]["suffix"] = suffix + + return self.rollout_cache def make_decision(self, rollout_cache: RolloutCache): lm_input, messages = self.format_messages(rollout_cache) + # cache length of newly appended prompt to help to compute response_mask + rollout_cache.history[-1]["input_ids_length"] = lm_input.batch["input_ids"].shape[1] + rollout_cache.history[-1]["prompt_ids_length"] = rollout_cache.history[-1]["input_ids_length"] - ( + (rollout_cache.history[-2]["input_ids_length"] + rollout_cache.history[-2]["response_ids_length"]) + if len(rollout_cache.history) >= 2 + else 0 + ) input_ids = lm_input.batch["input_ids"] if input_ids.shape[1] >= self.pipeline_config.sequence_length: @@ -136,14 +275,19 @@ def make_decision(self, rollout_cache: RolloutCache): if lm_output is None: return DataProto(meta_info={"stop_reason": GenerateStopReason.ABORT}) lm_output.meta_info["stop_reason"] = GenerateStopReason.FINISH + # cache length of response_ids to help to compute response_mask + # eos_token should be taken into account + rollout_cache.history[-1]["response_ids_length"] = len(lm_output.batch["responses"][0]) + self.logger.debug( + f"env_id={self.env_config['env_id']}, global_step={self.current_step}, episode_id={self.episode_id}, turn_idx={rollout_cache.step}, " + f"input_ids_length={rollout_cache.history[-1]['input_ids_length']}, prompt_ids_length={rollout_cache.history[-1]['prompt_ids_length']}, " + f"response_ids_length={rollout_cache.history[-1]['response_ids_length']}" + ) return lm_output def format_messages(self, history: RolloutCache) -> Tuple[DataProto, List[Dict]]: - - messages = [ - {"role": "system", "content": self.agent_system_template}, - ] - images = [] + messages = [{"role": "system", "content": self.agent_system_template}] + mm_data = None for idx, content in enumerate(history.history): @@ -151,39 +295,113 @@ def format_messages(self, history: RolloutCache) -> Tuple[DataProto, List[Dict]] "sequences, following the format of (s, a, r, s, a, r...).") pre_step_content = self.pre_step_template.format(turn_idx=idx + 1) - if self.rollout_cache.step == 0: - pre_step_content = history.history[0]["env_instruction"] + pre_step_content + # cannot use `self.rollout_cache.step==0` which would add env_instruction only once for multi-turns + if content["actions_left"] == self.env_config.max_steps: + # add env_instruction in the first step + pre_step_content = history.history[0].get("env_instruction", "") + pre_step_content next_step_content = self.next_step_template.format(actions_left=content["actions_left"], max_response_length=self.env_config["max_tokens_per_step"]) - base64_image = base64.b64encode(content["observation"]).decode("utf-8") - user_content_list_dict = [ - { - "type": "text", - "text": pre_step_content # 
Reward:\n1.0\nTurn 1:\nState: - }, - { - "type": "image", - "image": f"data:image/jpeg;base64,{base64_image}", - }, - { - "type": "text", - "text": next_step_content # You have 3 actions left. Always output: [your answer] with no extra text.Strictly follow this format. Max response length: 200 words (tokens).Decide the next action: - } - ] + obs = content["observation"] + obs_content = None + mm_dict = defaultdict(list) + # obs might be a str, a image (as ndarray), a dict with prompt/image/video as values, + if isinstance(obs, str): + obs_content = obs + elif isinstance(obs, np.ndarray): + obs_content = [{"type": "image"}] + mm_dict = {"image": [PIL.Image.fromarray(obs, mode="RGB")]} + else : + assert isinstance(obs, dict), f"observation type {type(obs)} is not supported" + obs_content = obs.get("prompt", "") + # str or list of dict, and the dict is item of chat format or user content + if isinstance(obs_content, list): + if "role" in obs_content[0]: + if obs_content[0].get("role", None) == "system": + messages[0]["content"] = obs_content[0]["content"] + obs_content = obs_content[1]["content"] + else: + obs_content = obs_content[0]["content"] + mm_dict = dict((k, v) for k, v in obs.items() if k not in ["prompt"]) + + # replace image placeholder included in env returned prompt + def replace_placeholder(text): + if "image" in mm_dict and getattr(self.env, "image_placeholder", None): + text = text.replace(self.env.image_placeholder, self.collator.image_token) + if "video" in mm_dict and getattr(self.env, "video_placeholder", None): + text = text.replace(self.env.video_placeholder, self.collator.video_token) + return text + + if not isinstance(obs_content, str): + pre_step_content = [ + { + "type": "text", + "text": pre_step_content, # Reward:\n1.0\nTurn 1:\nState: + } + ] + next_step_content = [ + { + "type": "text", + "text": next_step_content, # You have 3 actions left. Always output: [your answer] with no extra text.Strictly follow this format. 
Max response length: 200 words (tokens).Decide the next action: + } + ] + for obs_item in obs_content: + if obs_item["type"] == "text": + obs_item["text"] = replace_placeholder(obs_item["text"]) + else: + obs_content = replace_placeholder(obs_content) + user_content_list_dict = pre_step_content + obs_content + next_step_content messages.append({"role": "user", "content": user_content_list_dict}) - images.append(PIL.Image.fromarray(content["observation"], mode='RGB')) + if mm_dict: + mm_data = defaultdict(list) if mm_data is None else mm_data + for k, v in mm_dict.items(): + mm_data[k].extend([v] if not isinstance(v, (list, tuple)) else v) if "llm_response" in content: - messages.append({"role": "assistant", "content": content["llm_response"]}) - - lm_input_texts = self.tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) - features = [{ + # eos token is included in response, only need to process once actually + llm_response = ( + content["llm_response"][: -len(self.tokenizer.eos_token)] + if content["llm_response"].endswith(self.tokenizer.eos_token) + else content["llm_response"] + ) + messages.append({"role": "assistant", "content": llm_response}) + + if not messages[0]["content"]: + messages = messages[1:] + assert messages, f"empty messages with {history=}" + add_generation_prompt = False if messages[-1]["role"] == "assistant" else True + lm_input_texts = self.tokenizer.apply_chat_template( + messages, add_generation_prompt=add_generation_prompt, tokenize=False + ) + feature = { self.collator.prompt_key: lm_input_texts, - self.collator.image_key: images, - self.collator.image_flag_key: True - }] - inputs = self.collator(features) + } + if mm_data: + if "image" in mm_data: + feature[self.collator.image_key] = mm_data["image"] + if "video" in mm_data: + feature[self.collator.video_key] = mm_data["video"] + + self.logger.debug( + f"env_id={self.env_config['env_id']}, global_step={self.current_step}, episode_id={self.episode_id}, turn_idx={idx + 1}, {feature=}" + ) + if not add_generation_prompt: # the final multi-turn feature, no need for infer + self.collator.return_infer_inputs = False + inputs = self.collator([feature]) + self.collator.return_infer_inputs = True lm_input: DataProto = DataProto.from_single_dict(inputs) + if not add_generation_prompt: + # NOTE: apply_chat_template would append suffix in response such as "<|im_end|>\n", + # while generated response often contains "<|im_end|>", and "\n" should not be + # treated as response + history.history[-1]["extra_suffix_length"] = lm_input.batch["input_ids"].shape[1] - ( + history.history[-1]["input_ids_length"] + history.history[-1]["response_ids_length"] + ) + self.logger.debug( + f"env_id={self.env_config['env_id']}, global_step={self.current_step}, episode_id={self.episode_id}, turn_idx={history.step}, " + f"final input_ids_shape={lm_input.batch['input_ids'].shape}, last turn input_ids_length={history.history[-1]['input_ids_length']}/" + f"prompt_ids_length={history.history[-1]['prompt_ids_length']}/response_ids_length={history.history[-1]['response_ids_length']}, " + f"extra_suffix_length={history.history[-1]['extra_suffix_length']}" + ) return lm_input, messages @@ -195,20 +413,34 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): if 'observation' in rollout_cache.history[-1]: rollout_cache.history.pop(-1) + lm_input, messages = self.format_messages(rollout_cache) + + # can be used to trigger trajectory reward computation + if callable(getattr(self.env, "normalize_reward", None)): + 
self.env.normalize_reward(messages, rollout_cache, self.tokenizer) + scores = [i['reward'] for i in self.rollout_cache.history] episode_score = sum(scores) - lm_input, messages = self.format_messages(rollout_cache) - input_ids = lm_input.batch["input_ids"] attention_mask = lm_input.batch["attention_mask"] position_ids = lm_input.batch["position_ids"] token_ids = input_ids[0].tolist() - token_ids_split = split_by_token(token_ids, token_ids[0]) - response_masks_list = token_ids_to_assistant_mask(messages=messages, input_ids_list=token_ids_split, tokenizer=self.tokenizer) + # TODO: use length in cache to construct response_masks after conner case is fixed + # response_masks = [] + # for item in rollout_cache.history: + # response_masks.extend([0] * item["prompt_ids_length"] + [1] * item["response_ids_length"]) + # response_masks.extend([0] * item["extra_suffix_length"]) + token_ids_split = split_by_token(token_ids, token_ids[0], messages=messages, tokenizer=self.tokenizer) + response_masks_list = token_ids_to_assistant_mask( + messages=messages, input_ids_list=token_ids_split, tokenizer=self.tokenizer + ) response_masks = [item for items in response_masks_list for item in items] + assert len(response_masks) == len(token_ids), ( + f"response_masks length must be equal to token_ids length, {len(response_masks)=} != {len(token_ids)=}" + ) response_mask = torch.tensor(response_masks, dtype=torch.bool).unsqueeze(0) first_response_idx = response_masks.index(1) @@ -220,7 +452,11 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): input_ids = input_ids[:, :last_response_idx+1] attention_mask = attention_mask[:, :last_response_idx+1] - position_ids = position_ids[:, :, :last_response_idx+1] + position_ids = ( + position_ids[:, :, : last_response_idx + 1] + if position_ids.dim() == 3 + else position_ids[:, : last_response_idx + 1] + ) response_length = response_mask.sum(dim=-1).float().mean().item() input_ids = pad_to_length(input_ids, length=self.pipeline_config.sequence_length, pad_value=self.tokenizer.pad_token_id) @@ -255,5 +491,8 @@ def formulate_rollouts(self, rollout_cache: RolloutCache): env_metric = {f"env/{rollout_cache.tag}/{k}": v for k, v in env_metric.items()} env_metric["env/response_length"] = response_length lm_input.meta_info = {"metrics": env_metric} - return lm_input + if callable(getattr(self.env, "add_extra_data", None)): + self.env.add_extra_data(lm_input, messages) + + return lm_input diff --git a/roll/pipeline/agentic/llm_proxy/__init__.py b/roll/pipeline/agentic/llm_proxy/__init__.py index e925e3965..610929bc0 100644 --- a/roll/pipeline/agentic/llm_proxy/__init__.py +++ b/roll/pipeline/agentic/llm_proxy/__init__.py @@ -1,3 +1,5 @@ +from typing import Optional + from transformers import PreTrainedTokenizer import gem @@ -12,7 +14,7 @@ def create_llm_proxy( generate_scheduler: RequestScheduler, llm_proxy_config: LLMProxyConfig, tokenizer: PreTrainedTokenizer, - env: gem.Env) -> BaseLLMProxy: + env: Optional[gem.Env]) -> BaseLLMProxy: proxy_type = llm_proxy_config.proxy_type if proxy_type in LLM_PROXY_REGISTRY: cls = LLM_PROXY_REGISTRY[proxy_type] diff --git a/roll/pipeline/agentic/llm_proxy/openai_proxy.py b/roll/pipeline/agentic/llm_proxy/openai_proxy.py index 4e937ca0b..21350fc81 100644 --- a/roll/pipeline/agentic/llm_proxy/openai_proxy.py +++ b/roll/pipeline/agentic/llm_proxy/openai_proxy.py @@ -103,8 +103,10 @@ def generate(self, # Pass extra_body only if it's not empty extra_body=extra_body if extra_body else None ) - - response_text = 
completion.choices[0].message.content + if completion.choices is None: + response_text = "OpenAI API returned no choices." + else: + response_text = completion.choices[0].message.content responses = self.tokenizer([response_text], return_tensors="pt") lm_input.batch["responses"] = responses["input_ids"] lm_input.non_tensor_batch["response_text"] = np.array([response_text], dtype=object) diff --git a/roll/pipeline/agentic/llm_proxy/policy_proxy.py b/roll/pipeline/agentic/llm_proxy/policy_proxy.py index e1f4adf9b..76b6edaf9 100644 --- a/roll/pipeline/agentic/llm_proxy/policy_proxy.py +++ b/roll/pipeline/agentic/llm_proxy/policy_proxy.py @@ -18,7 +18,6 @@ def generate(self, generation_config: Dict[str, Any]) -> DataProto: lm_input.meta_info["generation_config"] = generation_config - lm_input.meta_info['response_callback_fn'] = self.generate_scheduler.report_response.remote lm_input.meta_info["pad_to_seq_len"] = False lm_output: DataProto = ray.get(self.generate_scheduler.generate_one_request.remote(data=lm_input)) diff --git a/roll/pipeline/agentic/llm_proxy/proxy_utils.py b/roll/pipeline/agentic/llm_proxy/proxy_utils.py new file mode 100644 index 000000000..9daddcda2 --- /dev/null +++ b/roll/pipeline/agentic/llm_proxy/proxy_utils.py @@ -0,0 +1,158 @@ +""" +Utility functions for LLM proxy operations. +""" + +from typing import List, Dict, Any, Optional, Union + +import torch +from tensordict import TensorDict +from transformers import PreTrainedTokenizer + +from roll.distributed.scheduler.protocol import DataProto +from roll.pipeline.agentic.llm_proxy.base_llm_proxy import BaseLLMProxy +from roll.utils.logging import get_logger + +logger = get_logger() + + +def generate_by_proxy( + messages: List[Dict[str, Any]], + tokenizer: PreTrainedTokenizer, + proxy: BaseLLMProxy, + enable_thinking: bool = False, + generation_config: Optional[Dict[str, Any]] = None, + collator: Optional[Any] = None, + mm_data: Optional[Dict[str, Any]] = None, + src_rank: Optional[int] = None, +) -> Optional[str]: + """ + Generate text through proxy with support for multimodal inputs. + + This function formats messages using chat template, creates a DataProto + with tokenized input (and optional multimodal data), calls proxy.generate(), + and returns the decoded text response. + + For text-only generation, it uses tokenizer directly. For multimodal generation, + it uses collator to process images/videos along with text. + + Args: + messages: List of message dictionaries for the prompt. + For text: [{"role": "user", "content": "..."}] + For multimodal: [{"role": "user", "content": [{"type": "text", "text": "..."}, + {"type": "image", "image": PIL.Image}]}] + tokenizer: Tokenizer for the inference model + proxy: LLM proxy for model inference + enable_thinking: Whether to enable thinking tags in chat template (text-only mode) + generation_config: Optional generation config to override defaults + (temperature, max_new_tokens, etc.) + collator: Optional DataCollatorWithPaddingForMM for multimodal processing. + If provided, multimodal mode is used. + mm_data: Optional multimodal data dict with "image" and/or "video" keys. + Only used when collator is provided. + src_rank: Optional source rank for request routing in scheduler. + If not provided, defaults to 0. + + Returns: + Decoded text response from the LLM, or None if the request fails + + Examples: + Text-only generation: + >>> messages = [{"role": "user", "content": "Judge this response..."}] + >>> response_text = generate_by_proxy( + ... messages=messages, + ... 
tokenizer=tokenizer, + ... proxy=proxy, + ... enable_thinking=True, + ... generation_config={"temperature": 0.2, "max_new_tokens": 2048} + ... ) + + Multimodal generation: + >>> messages = [{"role": "user", "content": "Describe this image"}] + >>> mm_data = {"image": [pil_image]} + >>> response_text = generate_by_proxy( + ... messages=messages, + ... tokenizer=tokenizer, + ... proxy=proxy, + ... collator=collator, + ... mm_data=mm_data + ... ) + """ + # Multimodal mode: use collator to process features + if collator is not None: + # Get text from chat template without tokenization + lm_input_texts = tokenizer.apply_chat_template( + messages, + add_generation_prompt=True, + tokenize=False + ) + + # Build feature dict + feature = { + collator.prompt_key: lm_input_texts, + } + + # Add multimodal data if provided + if mm_data: + if "image" in mm_data: + feature[collator.image_key] = mm_data["image"] + if "video" in mm_data: + feature[collator.video_key] = mm_data["video"] + + # Process through collator + inputs = collator([feature]) + lm_input: DataProto = DataProto.from_single_dict(inputs) + + # Text-only mode: tokenize directly + else: + # Format messages using chat template with optional thinking tags + prompt_ids = tokenizer.apply_chat_template( + messages, + tokenize=True, + add_generation_prompt=True, + enable_thinking=enable_thinking + ) + + # Create DataProto with tokenized input + input_ids = torch.tensor(prompt_ids, dtype=torch.long).unsqueeze(0) + attention_mask = torch.tensor([1] * input_ids.shape[1], dtype=torch.long).unsqueeze(0) + position_ids = attention_mask.cumsum(dim=-1) + + lm_input = DataProto() + lm_input.batch = TensorDict({ + "input_ids": input_ids, + "attention_mask": attention_mask, + "position_ids": position_ids, + }, batch_size=input_ids.shape[0]) + + # Set generation config with defaults if not provided + if generation_config is None: + generation_config = { + "temperature": 0.2, # Lower temperature for stable judgments + "max_new_tokens": 2048, + "top_p": 0.95, + } + + # Set src_rank for request routing in scheduler + lm_input.meta_info["src_rank"] = src_rank if src_rank is not None else 0 + + # Call proxy.generate() for inference + lm_output: Optional[DataProto] = proxy.generate( + messages=messages, + lm_input=lm_input, + generation_config=generation_config + ) + + # Handle failure cases + if lm_output is None: + logger.warning("LLM generation failed (returned None)") + return None + + # Extract response token IDs and decode to text + if "responses" not in lm_output.batch.keys(): + logger.error("LLM output missing 'responses' key") + return None + + response_ids = lm_output.batch['responses'][0] + response_text = tokenizer.decode(response_ids, skip_special_tokens=True) + + return response_text diff --git a/roll/pipeline/agentic/utils.py b/roll/pipeline/agentic/utils.py index 6cd46ba05..bc995145d 100644 --- a/roll/pipeline/agentic/utils.py +++ b/roll/pipeline/agentic/utils.py @@ -280,17 +280,31 @@ def dump_frames_as_gif(filename, frames, duration=0.2): pass +def remove_nan_items(data: Dict[str, np.ndarray]): + if not data: + return {} + + # 所有数组都假设 dtype=object,只有 None 需要过滤 + arr = np.vstack([np.asarray(v, dtype=object) for v in data.values()]) # (num_keys, N) + mask = arr != None # noqa: E711 + valid_row_mask = mask.all(axis=0) + return { + k: np.asarray(v, dtype=object)[valid_row_mask] + for k, v in data.items() + } + + def dump_rollout_trajectories(path, global_step, data: DataProto): """ Dumps rollout trajectories to persistent storage. 
- The data is written using a column-based configuration defined in COLUMNS_CONFIG. + The data is written using a column-based configuration defined in COLUMMNS_CONFIG. Each column is specified as a list [column_name, data_type], where: - column_name: string identifier for the column - data_type: data type specification ('bigint', 'string', 'double', etc.) Example configuration: - columns_config = [ + colummns_config = [ ['global_step', 'bigint'], ['id', 'string'], ['source', 'string'], @@ -300,14 +314,15 @@ def dump_rollout_trajectories(path, global_step, data: DataProto): if not path: return - columns_config: Optional[List] = data.meta_info.get("COLUMNS_CONFIG", None) + columns_config: Optional[List] = data.meta_info.get("COLUMMNS_CONFIG", None) if columns_config is None: return - write_data = copy.deepcopy(data.non_tensor_batch) - [data.non_tensor_batch.pop(item[0]) for item in columns_config if item[0] in data.non_tensor_batch] + write_data = {item[0]: data.non_tensor_batch.pop(item[0]) for item in columns_config if item[0] in data.non_tensor_batch} + + write_data = remove_nan_items(copy.deepcopy(write_data)) + data_cnt = len(write_data[columns_config[0][0]]) - data_cnt = len(data) write_data["global_step"] = [global_step] * data_cnt columns_config.append(["global_step", "bigint"]) @@ -316,6 +331,128 @@ def dump_rollout_trajectories(path, global_step, data: DataProto): p = multiprocessing.Process(target=func, args=(path, write_data, columns_config), daemon=False) p.start() + +def compute_segment_masked_mean(tensor: torch.Tensor, mask: torch.Tensor) -> torch.Tensor: + """ + 对每段连续的1分别计算 masked_mean,不连续的段不相乘。 + + Args: + tensor: [batch_size, seq_len] 要计算的值 + mask: [batch_size, seq_len] mask,1表示有效位置,0表示无效位置 + + Returns: + [batch_size, seq_len] 结果,每段连续的1位置填充该段的 masked_mean + """ + batch_size, seq_len = mask.shape + device = mask.device + result = torch.zeros_like(tensor) + + # 对每个样本分别处理 + for b in range(batch_size): + sample_mask = mask[b] # [seq_len] + sample_tensor = tensor[b] # [seq_len] + + # 找到所有连续的1的段 + # 使用 diff 找到边界:1->0 和 0->1 的位置 + diff = torch.diff(sample_mask, prepend=torch.tensor([0], device=device)) + # 找到段的开始位置(0->1) + segment_starts = torch.where(diff == 1)[0] + # 找到段的结束位置(1->0),diff[i]==-1 表示 mask[i-1]==1 且 mask[i]==0,所以段的结束位置是 i(不包括i) + segment_ends = torch.where(diff == -1)[0] + + # 如果最后一个位置是1,需要添加结束位置 + if sample_mask[-1] == 1: + segment_ends = torch.cat([segment_ends, torch.tensor([seq_len], device=device)]) + + # 确保 segment_starts 和 segment_ends 长度匹配 + if len(segment_starts) != len(segment_ends): + # 如果长度不匹配,只处理能匹配的部分 + min_len = min(len(segment_starts), len(segment_ends)) + segment_starts = segment_starts[:min_len] + segment_ends = segment_ends[:min_len] + + # 对每段分别计算 masked_mean + for start, end in zip(segment_starts, segment_ends): + # 获取这段的索引 + segment_indices = torch.arange(start, end, device=device) + segment_mask = sample_mask[segment_indices] # 这段的mask + segment_tensor = sample_tensor[segment_indices] # 这段的值 + + if segment_mask.sum() > 0: + # 计算这段的 masked_mean(只考虑mask为1的位置) + segment_mean = (segment_tensor * segment_mask).sum() / (segment_mask.sum() + 1e-8) + # 将结果填充到这段内mask为1的位置 + result[b, segment_indices] = segment_mean * segment_mask + + return result + + +def compute_agentic_reinforce_return( + token_level_rewards: torch.Tensor, gamma: torch.Tensor, lambd: torch.Tensor, mask: Optional[torch.Tensor] = None +): + """ + 计算 REINFORCE 的 return,支持按 mask 分段 discount 衰减。 + 每段内所有位置获得相同的折扣累积值(从该段最后位置开始累积)。 + + Args: + token_level_rewards: [batch_size, seq_len] token 
level rewards + gamma: discount factor + lambd: lambda parameter (currently unused, kept for interface compatibility) + mask: [batch_size, seq_len] mask, 1 marks valid positions, 0 marks invalid positions. If None, all positions are used + + Returns: + advantages: [batch_size, seq_len] advantages + returns: [batch_size, seq_len] returns + """ + with torch.no_grad(): + batch_size, gen_len = token_level_rewards.shape + device = token_level_rewards.device + returns = torch.zeros_like(token_level_rewards, dtype=torch.float32) + + # If no mask is provided, compute over all positions (backward compatible) + if mask is None: + mask = torch.ones_like(token_level_rewards) + + # Make sure gamma is a scalar + gamma_val = gamma.item() if torch.is_tensor(gamma) else gamma + + # Process each sample separately + for b in range(batch_size): + sample_mask = mask[b] # [seq_len] + sample_rewards = token_level_rewards[b] # [seq_len] + + # Find all contiguous segments of 1s + # Use diff to locate the 1->0 and 0->1 boundaries + diff = torch.diff(sample_mask.float(), prepend=torch.tensor([0.0], device=device)) + + # Segment start positions (0->1, diff==1) + segment_starts = torch.where(diff == 1)[0] + + # Segment end positions (1->0, diff==-1) + segment_ends = torch.where(diff == -1)[0] + + # If the last position is 1, append the final end position + if len(sample_mask) > 0 and sample_mask[-1] == 1: + segment_ends = torch.cat([segment_ends, torch.tensor([gen_len], device=device)]) + + # Accumulate the discounted reward of each segment starting from its last position + cumulative_return = 0.0 + # Compute the discounted return for each segment separately + for start, end in zip(segment_starts.flip(-1), segment_ends.flip(-1)): + start_idx = start.item() + end_idx = end.item() + segment_len = end_idx - start_idx + + cumulative_return = sample_rewards[end_idx - 1].item() + gamma_val * cumulative_return + + # Every position in this segment is set to the accumulated value + returns[b, start_idx:end_idx] = cumulative_return + + advantages = returns + + return advantages, returns + + @torch.no_grad() def agentic_compute_advantage( data: "DataProto", @@ -350,9 +487,13 @@ def agentic_compute_advantage( token_level_rewards=token_level_rewards, gamma=gamma, lambd=lambd ) elif adv_estimator in ["agentic_reinforce"]: - raise NotImplementedError + advantages, returns = compute_agentic_reinforce_return( + token_level_rewards=token_level_rewards, gamma=gamma, lambd=lambd, mask=response_mask + ) else: raise NotImplementedError + + data.batch["raw_advantages"] = advantages if whiten_advantages: # TODO: should response length be taken into account during whitening?
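# Worked example (editorial sketch, hypothetical values): the raw_advantages stored above come from
# compute_agentic_reinforce_return, which fills each contiguous masked segment with the discounted
# return accumulated from later segments. With gamma=0.9, mask [0,1,1,0,1] and token_level_rewards
# [0,0,1,0,2], the later segment gets 2.0 and the earlier one 1.0 + 0.9*2.0 = 2.8:
#     adv, ret = compute_agentic_reinforce_return(
#         token_level_rewards=torch.tensor([[0.0, 0.0, 1.0, 0.0, 2.0]]),
#         gamma=0.9,
#         lambd=1.0,
#         mask=torch.tensor([[0.0, 1.0, 1.0, 0.0, 1.0]]),
#     )
#     # ret == tensor([[0.0, 2.8, 2.8, 0.0, 2.0]]); adv == ret before whitening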
advantages = masked_whiten(values=advantages, mask=response_mask) diff --git a/roll/pipeline/base_pipeline.py b/roll/pipeline/base_pipeline.py index ad12f91a1..5c4d67e78 100644 --- a/roll/pipeline/base_pipeline.py +++ b/roll/pipeline/base_pipeline.py @@ -3,9 +3,11 @@ import shutil from collections import defaultdict from concurrent import futures -from typing import List, Any, Dict +from typing import Any, Dict, List import ray +from ray.util.placement_group import PlacementGroup +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy from transformers import set_seed from roll.distributed.executor.cluster import Cluster @@ -18,7 +20,6 @@ from roll.utils.tracking import create_tracker from roll.utils.worker_state import WorkerState - logger = get_logger() @@ -29,8 +30,9 @@ class BasePipeline: def __init__(self, pipeline_config): set_seed(seed=pipeline_config.seed) self.pipeline_config = pipeline_config - self.resource_manager = ResourceManager(num_nodes=self.pipeline_config.num_nodes, - num_gpus_per_node=self.pipeline_config.num_gpus_per_node) + self.resource_manager = ResourceManager( + num_nodes=self.pipeline_config.num_nodes, num_gpus_per_node=self.pipeline_config.num_gpus_per_node + ) self.state = WorkerState() self.checkpoint_manager = CheckpointManager(checkpoint_config=self.pipeline_config.checkpoint_config) self.tracker = create_tracker( @@ -60,7 +62,7 @@ def run(self): def set_model_update_pair(self, src_cluster, tgt_cluster, frequency=1): self.model_update_groups.append( - ModelUpdateGroup(src_cluster=src_cluster, tgt_cluster=tgt_cluster, frequency=frequency) + ModelUpdateGroup(src_cluster=src_cluster, tgt_cluster=tgt_cluster, frequency=frequency, pipeline_config=self.pipeline_config) ) def set_checkpoint_clusters(self, *clusters): @@ -70,9 +72,13 @@ def model_update(self, global_step): metrics = {} for model_update_group in self.model_update_groups: metrics.update(model_update_group.model_update(global_step)) + model_update_group.tgt_cluster.process_weights_after_loading() return metrics - def do_checkpoint(self, global_step): + def do_checkpoint(self, global_step, is_last_step=None): + if is_last_step is None: + is_last_step = global_step == self.pipeline_config.max_steps - 1 + metrics = self.state.log_history[-1] metrics["system/step"] = global_step if global_step > 0 and ( @@ -80,7 +86,9 @@ def do_checkpoint(self, global_step): ): ckpt_metrics_refss = [] for cluster in self.checkpoint_clusters: - ckpt_metrics_refss.append(cluster.do_checkpoint(global_step=global_step, blocking=False)) + ckpt_metrics_refss.append( + cluster.do_checkpoint(global_step=global_step, is_last_step=is_last_step, blocking=False) + ) for ckpt_metrics_refs in ckpt_metrics_refss: ckpt_metrics = DataProto.materialize_concat(data_refs=ckpt_metrics_refs) @@ -147,13 +155,33 @@ def _cleanup_old_checkpoints(self): logger.warning(f"Failed to delete checkpoint {ckpt_dir}: {e}") def download_models(self, *clusters: Cluster): - node2worker: Dict[str, Any] = {} + node2pg: Dict[str, PlacementGroup] = {} node2model_names: Dict[str, set[str]] = defaultdict(set) for cluster in clusters: - for worker, node_ip in cluster.worker2nodes.items(): - node2worker[node_ip] = worker - if cluster.worker_config.model_args.model_name_or_path: - node2model_names[node_ip].add(cluster.worker_config.model_args.model_name_or_path) - if self.pipeline_config.resume_from_checkpoint: - node2model_names[node_ip].add(self.pipeline_config.resume_from_checkpoint) - 
ray.get([node2worker[node_ip].download_models.remote(model_name_or_paths=model_names) for node_ip, model_names in node2model_names.items()]) \ No newline at end of file + assert cluster.placement_groups is not None + for pg_list in cluster.placement_groups: + assert len(pg_list) > 0 + worker_nodes = set() + for pg in pg_list: + node_rank = pg["node_rank"] + if node_rank not in worker_nodes: + worker_nodes.add(node_rank) + node2pg[node_rank] = pg["placement_group"] + if cluster.worker_config.model_args.model_name_or_path: + node2model_names[node_rank].add(cluster.worker_config.model_args.model_name_or_path) + if self.pipeline_config.resume_from_checkpoint: + node2model_names[node_rank].add(self.pipeline_config.resume_from_checkpoint) + ray.get( + [ + download_models.options( + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=node2pg[node_rank]) + ).remote(model_name_or_paths=model_names) + for node_rank, model_names in node2model_names.items() + ] + ) + +@ray.remote +def download_models(model_name_or_paths: set[str]): + with futures.ThreadPoolExecutor(max_workers=5) as thread_executor: + futures.wait([thread_executor.submit(download_model, model_name_or_path) + for model_name_or_path in model_name_or_paths]) diff --git a/roll/pipeline/base_worker.py b/roll/pipeline/base_worker.py index d5c84c120..c0ae33d06 100644 --- a/roll/pipeline/base_worker.py +++ b/roll/pipeline/base_worker.py @@ -1,7 +1,8 @@ +import inspect import os import threading import time -from typing import Union, Optional, Dict +from typing import Dict, Optional, Union, List import ray import torch @@ -10,38 +11,30 @@ from roll.configs.worker_config import WorkerConfig from roll.distributed.executor.worker import Worker -from roll.distributed.scheduler.decorator import register, Dispatch +from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto from roll.distributed.strategy.factory import create_strategy from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy -from roll.models.model_providers import default_actor_model_provider, default_value_model_provider, \ - default_reward_model_provider, default_diffusion_module_provider -from roll.utils.checkpoint_manager import download_model -from roll.utils.context_managers import state_offload_manger -from roll.utils.functionals import ( - append_to_dict, - masked_mean, - compute_approx_kl, - postprocess_generate, - GenerateRequestType, - agg_loss, +from roll.models.model_providers import ( + default_actor_model_provider, + default_diffusion_module_provider, + default_reward_model_provider, + default_value_model_provider, ) +from roll.platforms import current_platform +from roll.utils.checkpoint_manager import download_model +from roll.utils.context_managers import state_offload_manger, log_gpu_memory_usage +from roll.utils.dynamic_batching import make_mini_batch_iter_for_dynamic_batching +from roll.utils.functionals import agg_loss, append_to_dict, compute_approx_kl, masked_mean, postprocess_generate, reduce_metrics from roll.utils.offload_nccl import reload_process_groups from roll.utils.offload_states import OffloadStateType -from roll.utils.dynamic_batching import make_mini_batch_iter_for_dynamic_batching -from roll.platforms import current_platform class ActorWorker(Worker): def __init__(self, worker_config: WorkerConfig): super().__init__(worker_config=worker_config) self.tokenizer = None - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None - 
self.response_call_back_fns = {} - self.response_callback_refs = [] - self.server_metrics = {} - self.thread_server = None - self.offload_manager = None + self.strategy: TrainStrategy = None self._logprobs_cache = {} @register(dispatch_mode=Dispatch.ONE_TO_ALL) @@ -63,12 +56,6 @@ def initialize(self, pipeline_config): self.strategy.offload_states() - # Platform must have been initialized when calling current_platform.reset_max_memory_allocated - # with arguments (inside state_offload_manager). We explicitly init platform here because - # current process is used as engine client when using vllm v1 engine, and - # there is no chance to init platform context. - current_platform.init() - @register(dispatch_mode=Dispatch.DP_MP_DISPATCH_FIRST) def train_step(self, data: DataProto): """ @@ -88,17 +75,18 @@ def train_step(self, data: DataProto): ): data = data.to(current_platform.device_type) data = self.strategy.get_data_input(data) + per_device_train_batch_size = self.worker_config.training_args.per_device_train_batch_size + backward_batch_size = ( + per_device_train_batch_size * self.worker_config.training_args.gradient_accumulation_steps + ) if self.worker_config.use_dynamic_batching_in_train: + # TODO: support `keep_mini_batch`, The number of mini_batch may be smaller than original size dataloader = make_mini_batch_iter_for_dynamic_batching( - data = data, + data=data, epochs=self.pipeline_config.ppo_epochs, - ga_steps = self.worker_config.training_args.gradient_accumulation_steps + ga_steps=self.worker_config.training_args.gradient_accumulation_steps, ) else: - per_device_train_batch_size = self.worker_config.training_args.per_device_train_batch_size - backward_batch_size = ( - per_device_train_batch_size * self.worker_config.training_args.gradient_accumulation_steps - ) dataloader = data.make_iterator( mini_batch_size=backward_batch_size, epochs=self.pipeline_config.ppo_epochs, @@ -106,117 +94,22 @@ def train_step(self, data: DataProto): dataloader_kwargs={"shuffle": True}, ) - for batch_idx, data in enumerate(dataloader): - pg_metrics = self.strategy.train_step(batch=data, loss_func=self.loss_func) + for batch_idx, backward_batch in tqdm(enumerate(dataloader), + desc=f"{self.worker_name} train global step {global_step}", + total=data.batch.batch_size[0] * self.pipeline_config.ppo_epochs // backward_batch_size): + pg_metrics = self.strategy.train_step(batch=backward_batch, loss_func=self.loss_func) + if self.worker_config.use_dynamic_batching_in_train or self.worker_config.use_sequence_packing: + pg_metrics = reduce_metrics(pg_metrics) append_to_dict(metrics, pg_metrics) metrics["actor/lr"] = self.strategy.scheduler.get_last_lr()[0] + metrics["actor/backward_steps"] = data.batch.batch_size[0] * self.pipeline_config.ppo_epochs // backward_batch_size data.to("cpu") self._logprobs_cache.clear() output = DataProto(meta_info={"metrics": metrics}) return output - @register(dispatch_mode=Dispatch.DP_MP_COMPUTE) - @torch.no_grad() - def generate(self, data: DataProto): - """ - batch = TensorDict( - { - 'prompts': idx, - 'responses': response, - 'input_ids': seq, # here input_ids become the whole sentences - 'attention_mask': attention_mask, - 'position_ids': position_ids, - 'old_log_probs': log_probs, - }, - batch_size=batch_size) - return DataProto(batch=batch) - """ - if "generation_config" not in data.meta_info: - generation_config = self.worker_config.generating_args.to_dict() - else: - generation_config = data.meta_info["generation_config"] - - generation_config["eos_token_id"] = 
[self.tokenizer.eos_token_id, self.tokenizer.pad_token_id] - generation_config["pad_token_id"] = self.tokenizer.pad_token_id - - global_step = data.meta_info.get("global_step", 0) - is_offload_states = data.meta_info.get("is_offload_states", True) - self.logger.info(f"{self.worker_name} generate global step {global_step}") - - metrics = {} - with state_offload_manger( - strategy=self.strategy, - metrics=metrics, - metric_infix=f"{self.cluster_name}/generate", - is_offload_states=is_offload_states, - ): - data = data.to(current_platform.device_type) - data.meta_info["micro_batch_size"] = self.worker_config.infer_batch_size - - output = self.strategy.generate(batch=data, generation_config=generation_config) - output = postprocess_generate( - prompts=data, - output=output, - num_return_sequences=generation_config["num_return_sequences"], - sequence_length=self.pipeline_config.sequence_length, - eos_token_id=self.tokenizer.eos_token_id, - pad_token_id=self.tokenizer.pad_token_id, - ) - data.to("cpu") - output = output.to("cpu") - - output.meta_info = {"metrics": metrics} - return output - - @register(dispatch_mode=Dispatch.ONE_TO_ALL_ONE) - @torch.no_grad() - def start_server(self, data: DataProto): - """ - 解决dp generate的长尾问题,async+ load balance - """ - if self.thread_server is not None: - return - - global_step = data.meta_info.get("global_step", 0) - is_offload_states = data.meta_info.get("is_offload_states", True) - - self.logger.info(f"{self.worker_name} generate server global step {global_step}") - self.response_call_back_fns = {} - - self.response_callback_refs = [] - self.server_metrics = {} - self.offload_manager = state_offload_manger( - strategy=self.strategy, - metrics=self.server_metrics, - metric_infix=f"{self.cluster_name}/generate", - is_offload_states=is_offload_states, - load_kwargs={"include": [OffloadStateType.model_params]}, - ) - self.offload_manager.__enter__() - self.thread_server = threading.Thread( - target=self.strategy.start_server, kwargs=dict(data=data, request_complete_callback=self.request_complete) - ) - self.thread_server.start() - while not self.strategy.running: - time.sleep(0.1) - - @register(dispatch_mode=Dispatch.ONE_TO_ALL_ONE) - def stop_server(self, data: DataProto = None): - if self.thread_server == None: - return - - self.strategy.add_request(command=GenerateRequestType.STOP, data=None) - self.thread_server.join() - self.thread_server = None - self.response_call_back_fns.clear() - self.offload_manager.__exit__(None, None, None) - ray.get(self.response_callback_refs) - self.response_callback_refs.clear() - - return DataProto(meta_info={"metrics": self.server_metrics}) - @register(dispatch_mode=Dispatch.DP_MP_DISPATCH_FIRST) def compute_log_probs(self, data: DataProto): """ @@ -235,6 +128,7 @@ def compute_log_probs(self, data: DataProto): data = self.strategy.get_data_input(data) data = data.to(current_platform.device_type) data.meta_info["micro_batch_size"] = self.worker_config.infer_batch_size + with torch.no_grad(): results: Dict[str, torch.Tensor] = self.strategy.forward_step( batch=data, forward_func=self.forward_func_log_probs @@ -257,7 +151,7 @@ def forward_func_log_probs(self, data: DataProto, output_tensor: torch.Tensor): logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] ) entropy = self.strategy.op_compute_entropy(logits=output_tensor, attention_mask=data.batch["response_mask"]) - return log_probs, {"log_probs": log_probs.clone().detach(), "entropy": entropy.clone().detach()} + return 
torch.tensor(0., device=output_tensor.device), {"log_probs": log_probs.clone().detach(), "entropy": entropy.clone().detach()} def get_old_log_probs_with_cache(self, data: DataProto, log_probs: torch.Tensor) -> torch.Tensor: """ @@ -310,6 +204,9 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): ref_log_probs = data.batch["ref_log_probs"] advantages = data.batch["advantages"] + batch_num_tokens = data.meta_info['batch_num_tokens'] + global_valid_samples = data.meta_info['global_valid_samples'] + log_probs = self.strategy.op_compute_log_probs( logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] ) @@ -317,8 +214,16 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): ratio = (log_probs - old_log_probs).exp() - pg_clip_low = self.pipeline_config.pg_clip_low if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip - pg_clip_high = self.pipeline_config.pg_clip_high if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip + pg_clip_low = ( + self.pipeline_config.pg_clip_low + if self.pipeline_config.use_pg_clip_range + else self.pipeline_config.pg_clip + ) + pg_clip_high = ( + self.pipeline_config.pg_clip_high + if self.pipeline_config.use_pg_clip_range + else self.pipeline_config.pg_clip + ) surr1 = ratio * advantages surr2 = ratio.clamp(1 - pg_clip_low, 1 + pg_clip_high) * advantages pg_loss = -torch.min(surr1, surr2) @@ -326,11 +231,16 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): dual_clip_loss = -torch.max(-pg_loss, (1 + self.pipeline_config.pg_clip * 2) * advantages) pg_loss = torch.where(advantages < 0, dual_clip_loss, pg_loss) - pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode) + pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask']) - kl_loss = compute_approx_kl(log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=response_mask, - kl_penalty="k3") - kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode) + kl_loss = compute_approx_kl( + log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=response_mask, kl_penalty="k3" + ) + kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask']) approxkl = compute_approx_kl( log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="mse" @@ -347,11 +257,15 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): else: total_loss = pg_loss if self.pipeline_config.entropy_loss_coef > 0: - entropy = self.strategy.op_compute_entropy(logits=output_tensor, attention_mask=data.batch["response_mask"]) + entropy = self.strategy.op_compute_entropy( + logits=output_tensor, attention_mask=data.batch["response_mask"] + ) entropy_loss = agg_loss( loss_mat=entropy, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], ) total_loss = total_loss - entropy_loss * self.pipeline_config.entropy_loss_coef @@ -362,21 +276,38 @@ def loss_func(self, data: DataProto, output_tensor: 
torch.Tensor): "actor/ratio_mean": masked_mean(ratio, response_mask, dim=-1).mean().detach().item(), "actor/ratio_max": torch.max(ratio * response_mask).detach().item(), "actor/ratio_min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), - "actor/clipfrac": agg_loss(loss_mat=torch.lt(surr2, surr1).float(), loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), + "actor/clipfrac": agg_loss( + loss_mat=torch.lt(surr2, surr1).float(), + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ) + .detach() + .item(), "actor/pg_loss": pg_loss.detach().item(), "actor/kl_loss": kl_loss.detach().item(), "actor/total_loss": total_loss.detach().item(), - "actor/approxkl": agg_loss(loss_mat=approxkl, loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/policykl": agg_loss(loss_mat=policykl, loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), + "actor/approxkl": agg_loss( + loss_mat=approxkl, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ) + .detach() + .item(), + "actor/policykl": agg_loss( + loss_mat=policykl, loss_mask=response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], + ) + .detach() + .item(), } return total_loss, pg_metrics @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def do_checkpoint(self, global_step): + def do_checkpoint(self, global_step, is_last_step=None): if self.worker_config.offload_nccl: reload_process_groups() with Timer("do_checkpoint") as total_timer: @@ -386,7 +317,10 @@ def do_checkpoint(self, global_step): save_dir = os.path.join(self.pipeline_config.output_dir, self.worker_name, ckpt_id) self.logger.info(f"save checkpoint-{global_step} to {save_dir}") - exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id) + # could be passed for other strategy with kwargs + exec_metrics: Dict = self.strategy.save_checkpoint( + save_dir, global_step, ckpt_id, is_last_step=is_last_step + ) metrics = { f"time/{self.cluster_name}/do_checkpoint/total": total_timer.last, @@ -396,46 +330,217 @@ def do_checkpoint(self, global_step): output = DataProto(meta_info={"metrics": metrics}) return output - @register(dispatch_mode=Dispatch.ONE_TO_ALL, clear_cache=False) - def add_request(self, command, data: DataProto): + +class InferWorker(Worker): + def __init__(self, worker_config: WorkerConfig): + super().__init__(worker_config=worker_config) + self.tokenizer = None + self.strategy = None + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def initialize(self, pipeline_config): + super().initialize(pipeline_config) + + self.strategy = create_strategy(worker=self) + + await self.strategy.initialize(model_provider=default_actor_model_provider) + self.tokenizer = self.strategy.tokenizer + self.logger.info(f"{self.worker_name} initialized") + + await self.strategy.offload_states() + + # Platform must have been initialized when calling current_platform.reset_max_memory_allocated + # with arguments (inside state_offload_manager). 
We explicitly init platform here because + # current process is used as engine client when using vllm v1 engine, and + # there is no chance to init platform context. + current_platform.init() + + # TODO shigao 之前stop_server会返回一些offload_state_manager的metrics,现在删掉是否可行 + # def start_server + # def stop_server + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def load_states(self, *args, **kwargs): + await self.strategy.load_states(*args, **kwargs) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def offload_states(self, *args, **kwargs): + await self.strategy.offload_states(*args, **kwargs) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def load_states_partial(self, target_dp_ranks: List[int]): + """Load states for workers whose dp_rank is in target_dp_ranks.""" + + # Log entry memory (only for TP rank 0 to reduce log spam) + if self.rank_info.tp_rank == 0: + log_gpu_memory_usage( + head=f"Worker {self.rank} (DP {self.rank_info.dp_rank}) load_states_partial_entry", + logger=self.logger, + rank=None + ) + + assert getattr(self, "strategy", None) is not None, "worker has no strategy to load" + if self.rank_info.dp_rank in target_dp_ranks: + # AST: AST_PRECONDITION(is_model_in_gpu is False) - verify strategy offloaded before load + is_loaded = self._get_strategy_load_state() + + assert is_loaded is False, ( + f"Pre-condition: strategy must be offloaded before load_states_partial, " + f"got Worker {self.rank} (DP {self.rank_info.dp_rank}) is_model_in_gpu={is_loaded}" + ) + + await self.strategy.load_states() + self.logger.info(f"Worker {self.rank} (DP {self.rank_info.dp_rank}) loaded states") + else: + self.logger.debug(f"Worker {self.rank} (DP {self.rank_info.dp_rank}) skipped load") + + + # Log exit memory (only for TP rank 0 to reduce log spam) + if self.rank_info.tp_rank == 0: + log_gpu_memory_usage( + head=f"Worker {self.rank} (DP {self.rank_info.dp_rank}) load_states_partial_exit", + logger=self.logger, + rank=None + ) + + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def offload_states_partial(self, target_dp_ranks: List[int]): + """Offload states for workers whose dp_rank is in target_dp_ranks.""" + + # Log entry memory (only for TP rank 0 to reduce log spam) + if self.rank_info.tp_rank == 0: + log_gpu_memory_usage( + head=f"Worker {self.rank} (DP {self.rank_info.dp_rank}) offload_states_partial_entry", + logger=self.logger, + rank=None + ) + + assert getattr(self, "strategy", None) is not None, "worker has no strategy to offload" + if self.rank_info.dp_rank in target_dp_ranks: + # AST: AST_PRECONDITION(is_model_in_gpu is True) - verify strategy loaded before offload + is_loaded = self._get_strategy_load_state() + + assert is_loaded is True, ( + f"Pre-condition: strategy must be loaded before offload_states_partial, " + f"got Worker {self.rank} (DP {self.rank_info.dp_rank}) is_model_in_gpu={is_loaded}" + ) + + await self.strategy.offload_states() + self.logger.info(f"Worker {self.rank} (DP {self.rank_info.dp_rank}) offloaded states") + else: + self.logger.debug(f"Worker {self.rank} (DP {self.rank_info.dp_rank}) skipped offload") + + + # Log exit memory and verify offload success (only for TP rank 0 to reduce log spam) + if self.rank_info.tp_rank == 0: + log_gpu_memory_usage( + head=f"Worker {self.rank} (DP {self.rank_info.dp_rank}) offload_states_partial_exit", + logger=self.logger, + rank=None + ) + + # Verify offloaded workers have near-zero GPU memory usage + if self.rank_info.dp_rank in target_dp_ranks: + import torch + gpu_memory_gb = 
torch.cuda.memory_allocated() / 1024**3 + if gpu_memory_gb > 1.0: + raise RuntimeError( + f"GPU memory not properly offloaded for Worker {self.rank} (DP {self.rank_info.dp_rank}): " + f"{gpu_memory_gb:.2f} GB still allocated (expected < 1 GB after offload)" + ) + + + async def broadcast_parameter(self, *args, **kwargs): + await self.strategy.broadcast_parameter(*args, **kwargs) + + async def setup_collective_group(self, *args, **kwargs): + await self.strategy.setup_collective_group(*args, **kwargs) + + async def start_model_update(self, *args, **kwargs): + raise NotImplementedError + + async def update_parameter_in_bucket(self, *args, **kwargs): + await self.strategy.update_parameter_in_bucket(*args, **kwargs) + + async def add_lora(self, *args, **kwargs): + await self.strategy.add_lora(*args, **kwargs) + + @register(dispatch_mode=Dispatch.DP_MP_COMPUTE) + async def generate(self, data: DataProto): + """ + batch = TensorDict( + { + 'prompts': idx, + 'responses': response, + 'input_ids': seq, # here input_ids become the whole sentences + 'attention_mask': attention_mask, + 'position_ids': position_ids, + 'old_log_probs': log_probs, + }, + batch_size=batch_size) + return DataProto(batch=batch) + """ + if "generation_config" not in data.meta_info: + generation_config = self.worker_config.generating_args.to_dict() + else: + generation_config = data.meta_info["generation_config"] + + generation_config["eos_token_id"] = [self.tokenizer.eos_token_id, self.tokenizer.pad_token_id] + generation_config["pad_token_id"] = self.tokenizer.pad_token_id + + global_step = data.meta_info.get("global_step", 0) + self.logger.info(f"{self.worker_name} generate global step {global_step}") + + data = data.to("cuda") + data.meta_info["micro_batch_size"] = self.worker_config.infer_batch_size + + is_offload_states = data.meta_info.get("is_offload_states", True) + # state_offload_manager does not support async context + await self.strategy.load_states() + try: + output = await self.strategy.generate(batch=data, generation_config=generation_config) + output = postprocess_generate( + prompts=data, + output=output, + num_return_sequences=generation_config["num_return_sequences"], + sequence_length=self.pipeline_config.sequence_length, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id, + ) + data.to("cpu") + output = output.to("cpu") + finally: + if is_offload_states: + await self.strategy.offload_states() + + return output + + async def generate_request(self, data: DataProto): """ - data req meta_info里需要包含: - request_id: str - response_callback_fn: callable + data req meta_info里需要包含: request_id: str generation_config, 按request设置 + + on request cancellation: return DataProto with finish_reasons 'abort' """ - def alive_check(): - if self.thread_server is not None: - if not self.thread_server.is_alive(): - raise Exception("thread server has stopped unexpectedly. 
check stderr for more info.") - if command == GenerateRequestType.ALIVE_CHECK: - alive_check() - output = DataProto(meta_info={"request_counts": len(self.response_call_back_fns)}) - return output - elif command == GenerateRequestType.ADD: - alive_check() - assert "response_callback_fn" in data.meta_info, "response_callback_fn is not in data.meta_info" - is_num_return_sequences_expand = data.meta_info.get("is_num_return_sequences_expand", False) - if "generation_config" not in data.meta_info: - generation_config = self.worker_config.generating_args.to_dict() - if is_num_return_sequences_expand: - self.worker_config.generating_args.num_return_sequences = 1 - generation_config["num_return_sequences"] = 1 - self.logger.info(f"is_num_return_sequences_expand is True, set num_return_sequences to 1.") - else: - generation_config = data.meta_info["generation_config"] - generation_config["eos_token_id"] = [self.tokenizer.eos_token_id, self.tokenizer.pad_token_id] - generation_config["pad_token_id"] = self.tokenizer.pad_token_id - data.meta_info["generation_config"] = generation_config - self.response_call_back_fns[data.meta_info["request_id"]] = data.meta_info.pop("response_callback_fn") - self.strategy.add_request(command=command, data=data) - return DataProto(meta_info={"request_counts": len(self.response_call_back_fns)}) - - def request_complete(self, data: DataProto): + is_num_return_sequences_expand = data.meta_info.get("is_num_return_sequences_expand", False) + if "generation_config" not in data.meta_info: + generation_config = self.worker_config.generating_args.to_dict() + if is_num_return_sequences_expand: + self.worker_config.generating_args.num_return_sequences = 1 + generation_config["num_return_sequences"] = 1 + self.logger.info(f"is_num_return_sequences_expand is True, set num_return_sequences to 1.") + else: + generation_config = data.meta_info["generation_config"] + generation_config["eos_token_id"] = [self.tokenizer.eos_token_id, self.tokenizer.pad_token_id] + generation_config["pad_token_id"] = self.tokenizer.pad_token_id + data.meta_info["generation_config"] = generation_config + data = await self.strategy.generate_request(data=data) data.meta_info["eos_token_id"] = self.tokenizer.eos_token_id data.meta_info["pad_token_id"] = self.tokenizer.pad_token_id - response_call_back_fn = self.response_call_back_fns.pop(data.meta_info["request_id"]) - self.response_callback_refs.append(response_call_back_fn(data)) + return data + + async def abort_requests(self, request_ids): + await self.strategy.abort_requests(request_ids) class CriticWorker(Worker): @@ -578,13 +683,15 @@ def forward_func_values(self, data: DataProto, output_tensor: torch.Tensor): return values, {"values": values.clone().detach()} @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def do_checkpoint(self, global_step): + def do_checkpoint(self, global_step, is_last_step=None): with Timer("do_checkpoint") as total_timer: ckpt_id = f"checkpoint-{global_step}" save_dir = os.path.join(self.pipeline_config.output_dir, self.worker_name, ckpt_id, self.cluster_name) critic_save_dir = os.path.join(self.pipeline_config.output_dir, self.worker_name, ckpt_id) self.logger.info(f"save checkpoint-{global_step} to {save_dir}") - exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id, local_state_path=critic_save_dir) + exec_metrics: Dict = self.strategy.save_checkpoint( + save_dir, global_step, ckpt_id, local_state_path=critic_save_dir, is_last_step=is_last_step + ) metrics = { 
f"time/{self.cluster_name}/do_checkpoint/total": total_timer.last, diff --git a/roll/pipeline/diffusion/reward_fl/reward_fl_pipeline.py b/roll/pipeline/diffusion/reward_fl/reward_fl_pipeline.py index fc8150dde..6f3d15fd3 100644 --- a/roll/pipeline/diffusion/reward_fl/reward_fl_pipeline.py +++ b/roll/pipeline/diffusion/reward_fl/reward_fl_pipeline.py @@ -84,7 +84,8 @@ def run(self): with Timer(name="step_total", logger=None) as step_total_timer: batch_dict: Dict batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"global_step": global_step, "is_offload_states": False, "is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"global_step": global_step, "is_offload_states": False, + "is_offload_optimizer_states_in_train_step": False, "loss_mask_keys": []} with Timer(name="actor_train", logger=None) as actor_train_timer: actor_train_refs = self.actor_train.train_step(batch, blocking=False) diff --git a/roll/pipeline/distill/distill_config.py b/roll/pipeline/distill/distill_config.py index 6bebd3bae..e72f0c2cb 100644 --- a/roll/pipeline/distill/distill_config.py +++ b/roll/pipeline/distill/distill_config.py @@ -136,8 +136,6 @@ def __post_init__(self): self.target_vocab_size = None - self.validate_worker_config() - def to_dict(self): return dataclasses.asdict(self) diff --git a/roll/pipeline/distill/distill_pipeline.py b/roll/pipeline/distill/distill_pipeline.py index 63ed4e129..0b4dc6d8a 100644 --- a/roll/pipeline/distill/distill_pipeline.py +++ b/roll/pipeline/distill/distill_pipeline.py @@ -24,7 +24,7 @@ from roll.utils.metrics.metrics_manager import MetricsManager from roll.utils.constants import IGNORE_INDEX from roll.pipeline.distill.logits_transfer_group import LogitsTransferGroup - +from roll.utils.functionals import batch_balance logger = get_logger() @@ -233,10 +233,11 @@ def __init__(self, pipeline_config: DistillConfig): self.logits_transfer_group = LogitsTransferGroup(self.teacher, self.student, backend=self.pipeline_config.logits_transfer_backend,) - self.dataloader = get_dataloader(dataset, - self.pipeline_config.student.training_args.per_device_train_batch_size *\ + self.batch_size = self.pipeline_config.student.training_args.per_device_train_batch_size *\ self.pipeline_config.student.training_args.gradient_accumulation_steps *\ - self.student.get_rank_info(0).dp_size, + self.student.dp_size + self.dataloader = get_dataloader(dataset, + self.batch_size, data_collator, num_proc=self.pipeline_config.student.training_args.dataloader_num_workers) @@ -283,7 +284,12 @@ def run(self): metrics_mgr.add_metric("time/val", val_timer.last) batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"global_step": global_step, "is_offload_states": False, "is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"global_step": global_step, "is_offload_states": False, "is_offload_optimizer_states_in_train_step": False, + 'loss_mask_keys': ['labels_for_loss']} + # Reorder data for DP rank load balancing + batch_balance_metrics = batch_balance(batch, dp_size=self.student.dp_size, minibatch_size=self.batch_size) + metrics_mgr.add_metrics(batch_balance_metrics) + batch_offset = self.logits_transfer_group.apply_offset_by_dp(batch) with Timer(name="step_train", logger=None) as step_train_timer: with Timer(name="teacher_forward", logger=None) as teacher_timer: diff --git a/roll/pipeline/distill/distill_vlm_pipeline.py b/roll/pipeline/distill/distill_vlm_pipeline.py index 1798b6a7d..40672161a 100644 --- 
a/roll/pipeline/distill/distill_vlm_pipeline.py +++ b/roll/pipeline/distill/distill_vlm_pipeline.py @@ -262,7 +262,8 @@ def run(self): metrics_mgr.clear_metrics() batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"global_step": global_step, "is_offload_states": False, "is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"global_step": global_step, "is_offload_states": False, + "is_offload_optimizer_states_in_train_step": False, "loss_mask_keys": ["labels_for_loss"]} batch_offset = self.logits_transfer_group.apply_offset_by_dp(batch) with Timer(name="step_train", logger=None) as step_train_timer: with Timer(name="teacher_forward", logger=None) as teacher_timer: diff --git a/roll/pipeline/distill/distill_worker.py b/roll/pipeline/distill/distill_worker.py index f6875ac0d..4cba33fe3 100644 --- a/roll/pipeline/distill/distill_worker.py +++ b/roll/pipeline/distill/distill_worker.py @@ -43,10 +43,6 @@ def __init__(self, worker_config: WorkerConfig): self.teacher_log_probs = None self.teacher_topk_indices = None self.teacher_inf_mask = None - self.teacher_probs_iterator = None - self.teacher_log_probs_iterator = None - self.teacher_topk_indices_iterator = None - self.teacher_inf_mask_iterator = None @register(dispatch_mode=Dispatch.ONE_TO_ALL) def initialize(self, pipeline_config): @@ -76,20 +72,6 @@ def train_step(self, data: DataProto): is_offload_states = data.meta_info.get("is_offload_states", True) metrics = {} micro_batch_size = self.worker_config.training_args.per_device_train_batch_size - - # Retrieve the teacher logits - if self.rank_info.is_pipeline_last_stage: - self.teacher_probs = self.probs_cache.pop_full_logits() - self.teacher_probs_iterator = iter(self.teacher_probs.split(micro_batch_size, dim=0)) - self.teacher_log_probs = self.log_probs_cache.pop_full_logits() - self.teacher_log_probs_iterator = iter(self.teacher_log_probs.split(micro_batch_size, dim=0)) - # Retrieve the teacher_topk_indices - if self.rank_info.is_pipeline_last_stage: - self.teacher_topk_indices = self.topk_indices_cache.pop_full_logits() - if self.pipeline_config.logits_topk != 0: - self.teacher_topk_indices_iterator = iter(self.teacher_topk_indices.split(micro_batch_size, dim=0)) - self.teacher_inf_mask = self.inf_mask_cache.pop_full_logits() - self.teacher_inf_mask_iterator = iter(self.teacher_inf_mask.split(micro_batch_size, dim=0)) self.logger.info(f"is_offload_states: {is_offload_states}") with state_offload_manger( strategy=self.strategy, @@ -100,20 +82,20 @@ def train_step(self, data: DataProto): ): data = data.to(current_platform.device_type) data = self.strategy.get_data_input(data) + if self.rank_info.is_pipeline_last_stage: + # Retrieve the teacher logits + data.batch['teacher_probs'] = self.probs_cache.pop_full_logits() + data.batch['teacher_log_probs'] = self.log_probs_cache.pop_full_logits() + # Retrieve the teacher_topk_indices + if self.pipeline_config.logits_topk != 0: + data.batch['teacher_topk_indices'] = self.topk_indices_cache.pop_full_logits() + data.batch['teacher_inf_mask'] = self.inf_mask_cache.pop_full_logits() if "labels" in data.batch.keys(): # rename key: labels -> labels_for_loss data.batch.rename_key_("labels", "labels_for_loss") self.logger.info(f"global_step: {data.meta_info.get('global_step',0)}") - per_device_train_batch_size = self.worker_config.training_args.per_device_train_batch_size - backward_batch_size = ( - per_device_train_batch_size * self.worker_config.training_args.gradient_accumulation_steps - ) - loss_func = 
self.loss_func - if self.worker_config.use_sequence_packing: - from roll.utils.sequence_packing import SequencePackingDistillLossWrapper - loss_func = SequencePackingDistillLossWrapper(self.strategy, loss_func) - student_metrics = self.strategy.train_step(batch=data, loss_func=loss_func) + student_metrics = self.strategy.train_step(batch=data, loss_func=self.loss_func) append_to_dict(metrics, student_metrics) data.to("cpu") @@ -126,44 +108,38 @@ def train_step(self, data: DataProto): def loss_func(self, data: DataProto, output_tensor: torch.Tensor): """ Loss function interface definition: - data: DataProto, passed through unchanged from train_step + data: DataProto, passed through unchanged from train_step output_tensor: torch.Tensor, the tensor returned by model.forward() """ + batch_num_tokens = data.meta_info['batch_num_tokens']['labels_for_loss'] student_logits = output_tensor labels = data.batch['labels_for_loss'] # language loss - gpt_loss, _ = self.strategy.op_compute_language_loss_from_logits(student_logits, labels) + gpt_loss, _ = self.strategy.op_compute_language_loss_from_logits(student_logits, labels, reduction='sum') + gpt_loss = gpt_loss / batch_num_tokens # distill loss - if self.teacher_probs_iterator is not None: - teacher_probs = next(self.teacher_probs_iterator) - else: - teacher_probs = None - if self.teacher_log_probs_iterator is not None: - teacher_log_probs = next(self.teacher_log_probs_iterator) - else: - teacher_log_probs = None - if self.teacher_topk_indices_iterator is not None: - teacher_topk_indices = next(self.teacher_topk_indices_iterator) + teacher_probs = data.batch['teacher_probs'] + teacher_log_probs = data.batch['teacher_log_probs'] + if 'teacher_topk_indices' in data.batch: + teacher_topk_indices = data.batch['teacher_topk_indices'] else: teacher_topk_indices = None - if self.teacher_inf_mask_iterator is not None: - teacher_inf_mask = next(self.teacher_inf_mask_iterator) - else: - teacher_inf_mask = None + teacher_inf_mask = data.batch['teacher_inf_mask'] distill_loss, _ = self.strategy.op_compute_various_divergence(self.kl_loss_func, student_logits, teacher_probs, teacher_log_probs, teacher_topk_indices, teacher_inf_mask - , labels, attention_mask=None,) + , labels, attention_mask=None, reduction='sum') + distill_loss = distill_loss / batch_num_tokens loss = ((1 - self.pipeline_config.distill_loss_weight) * gpt_loss + self.pipeline_config.distill_loss_weight * distill_loss) student_metrics = { - "train/loss": loss.detach().item(), - "train/train_distill_loss": distill_loss.detach().item(), - "train/train_student_loss": gpt_loss.detach().item(), + "train/loss@sum": loss.detach().item(), + "train/train_distill_loss@sum": distill_loss.detach().item(), + "train/train_student_loss@sum": gpt_loss.detach().item(), } return loss, student_metrics @@ -180,20 +156,22 @@ def val_step(self, data: DataProto): return output def loss_func_for_eval(self, data: DataProto, output_tensor: torch.Tensor): + batch_num_tokens = data.meta_info['batch_num_tokens']['labels_for_loss'] labels = data.batch['labels_for_loss'] - gpt_loss, _ = self.strategy.op_compute_language_loss_from_logits(output_tensor, labels) + gpt_loss, _ = self.strategy.op_compute_language_loss_from_logits(output_tensor, labels, reduction='sum') + gpt_loss = gpt_loss / batch_num_tokens student_metrics = { - "student/val_loss": gpt_loss.detach().item(), + "student/val_loss@sum": gpt_loss.detach().item(), } return gpt_loss, student_metrics @register(dispatch_mode=Dispatch.ONE_TO_ALL) - def do_checkpoint(self, 
global_step): + def do_checkpoint(self, global_step, is_last_step=False): with Timer("do_checkpoint") as total_timer: ckpt_id = f"checkpoint-{global_step}" save_dir = os.path.join(self.pipeline_config.output_dir, self.worker_name, ckpt_id, self.cluster_name) self.logger.info(f"save checkpoint-{global_step} to {save_dir}") - exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id) + exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id, is_last_step=is_last_step) metrics = { f"time/{self.cluster_name}/do_checkpoint/total": total_timer.last, @@ -256,19 +234,22 @@ def broadcast_logits(self, tensor_name_for_transfer, tp=False, cp=False): logits_cache = getattr(self, cache_name) rank_info = self.rank_info self.logger.info( - f"[Student][broadcast_logits] rank={dist.get_rank()}, pp={rank_info.pp_rank}, dp={rank_info.dp_rank}, tp={rank_info.tp_rank}, " + f"[Student][broadcast_logits] rank={dist.get_rank()}, pp={rank_info.pp_rank}, dp={rank_info.dp_rank}," + f" tp={rank_info.tp_rank}, cp={rank_info.cp_rank} " f"is_pipeline_last_stage={rank_info.is_pipeline_last_stage}, tp_size={rank_info.tp_size}" ) - if rank_info.is_pipeline_last_stage and rank_info.tp_size > 1: + if rank_info.is_pipeline_last_stage and (rank_info.tp_size > 1 or rank_info.cp_size > 1): assert self.strategy.strategy_name == "megatron_train", \ - f"Error in DistillWorker broadcast_logits: {self.strategy.strategy_name} with tp_size == {rank_info.tp_size}" + f"Error in DistillWorker broadcast_logits: {self.strategy.strategy_name}, which must be megatron_train" from megatron.core import mpu - if tp: + if tp and rank_info.tp_size > 1: group = mpu.get_tensor_model_parallel_group() rank = rank_info.tp_rank - else: + elif cp and rank_info.cp_size > 1: group = mpu.get_context_parallel_group() rank = rank_info.cp_rank + else: + return self.logger.info( f"[Student][broadcast_logits] calling logits_cache.broadcast_from_dynamic_holder(), tp={tp}, cp={cp}, group={group}, rank={rank}" ) @@ -497,8 +478,8 @@ def forward_func(self, data: DataProto, output_tensor: torch.Tensor, non_loss_da def forward(self, data: DataProto): data = self.strategy.get_data_input(data) if "labels" in data.batch.keys(): - keep_keys = [k for k in data.batch.keys() if k != "labels"] - data = data.select(batch_keys=keep_keys, deepcopy=False) + # rename key: labels -> labels_for_loss + data.batch.rename_key_("labels", "labels_for_loss") is_offload_states = data.meta_info.get("is_offload_states", False) metrics = {} with state_offload_manger( @@ -510,20 +491,12 @@ def forward(self, data: DataProto): ): data = data.to(current_platform.device_type) data.meta_info["micro_batch_size"] = self.pipeline_config.teacher.training_args.per_device_train_batch_size - assert self.pipeline_config.teacher.training_args.per_device_train_batch_size <= \ - self.pipeline_config.student.training_args.per_device_train_batch_size, \ - "Teacher's per_device_train_batch_size must be less than or equal to student's." 
- + assert data.meta_info["micro_batch_size"] <= data.batch.batch_size[0] data.meta_info["output_on_all_tp_cp_ranks"] = True self.logger.info(f"global_step: {data.meta_info.get('global_step', 0)}") - forward_func = self.forward_func - if self.worker_config.use_sequence_packing: - from roll.utils.sequence_packing import SequencePackingDistillForwardWrapper - forward_func = SequencePackingDistillForwardWrapper(self.strategy, forward_func) - with torch.no_grad(): - forward_output = self.strategy.forward_step(batch=data, forward_func=forward_func) + forward_output = self.strategy.forward_step(batch=data, forward_func=self.forward_func) self.topk_probs = None self.topk_log_probs = None self.topk_indices = None diff --git a/roll/pipeline/dpo/dpo_config.py b/roll/pipeline/dpo/dpo_config.py index cdc38afba..7ca21beae 100644 --- a/roll/pipeline/dpo/dpo_config.py +++ b/roll/pipeline/dpo/dpo_config.py @@ -75,7 +75,11 @@ def __post_init__(self): self.actor_train.name = "actor_train" self.reference.name = "reference" - self.validate_worker_config() + assert self.actor_train.use_sequence_packing == False and self.reference.use_sequence_packing == False,\ + "dpo pipeline doesn't support use sequence packing now" + + self.actor_train.apply_loss_scale = False + self.reference.apply_loss_scale = False def set_max_steps(self, max_steps: int): self.max_steps = max_steps diff --git a/roll/pipeline/dpo/dpo_pipeline.py b/roll/pipeline/dpo/dpo_pipeline.py index b5450c7f0..00641f870 100644 --- a/roll/pipeline/dpo/dpo_pipeline.py +++ b/roll/pipeline/dpo/dpo_pipeline.py @@ -1,6 +1,7 @@ import copy import json import os +import time from typing import Any, Dict, List import datasets @@ -197,7 +198,8 @@ def run(self): with Timer(name="step_total", logger=None) as step_total_timer: batch_dict: Dict batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"global_step": global_step, "is_offload_states": False, "is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"global_step": global_step, "is_offload_states": False, + "is_offload_optimizer_states_in_train_step": False, 'loss_mask_keys': []} with Timer(name="cal_ref_log_probs", logger=None) as cal_ref_log_probs_timer: ref_log_probs = self.reference.compute_log_probs(batch, blocking=True) @@ -246,6 +248,7 @@ def val(self): for batch_dict in tqdm(self.val_dataloader): batch_dict: Dict batch: DataProto = DataProto.from_single_dict(batch_dict) + batch.meta_info['loss_mask_keys'] = [] with Timer(name="cal_ref_log_probs", logger=None) as cal_ref_log_probs_timer: ref_log_probs = self.reference.compute_log_probs(batch, blocking=True) diff --git a/roll/pipeline/rlvr/actor_pg_worker.py b/roll/pipeline/rlvr/actor_pg_worker.py index 2910fed0f..5e6f8e946 100644 --- a/roll/pipeline/rlvr/actor_pg_worker.py +++ b/roll/pipeline/rlvr/actor_pg_worker.py @@ -4,6 +4,7 @@ from roll.distributed.scheduler.protocol import DataProto from roll.utils.functionals import masked_mean, agg_loss, compute_approx_kl from roll.pipeline.rlvr.actor_worker import ActorWorker +from roll.utils.train_infer_corrections import compute_train_infer_correction class ActorPGWorker(ActorWorker): @@ -15,7 +16,7 @@ def __init__(self, *args, **kwargs): self._topr_sample_logged = False self._cispo_config_logged = False self._kimi15_config_logged = False - + def _get_or_cache_config(self, key, default_value): """获取或缓存配置值""" if key not in self._pg_config_cache: @@ -35,16 +36,41 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): ref_log_probs = 
data.batch["ref_log_probs"] advantages = data.batch["advantages"] + batch_num_tokens = data.meta_info['batch_num_tokens'] + global_valid_samples = data.meta_info['global_valid_samples'] + if 'final_response_mask' not in batch_num_tokens: + batch_num_tokens['final_response_mask'] = batch_num_tokens['response_mask'] + global_valid_samples['final_response_mask'] = global_valid_samples['response_mask'] + log_probs = self.strategy.op_compute_log_probs( logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] ) old_log_probs = self.get_old_log_probs_with_cache(data, log_probs) + infer_log_probs = data.batch.get("infer_logprobs", old_log_probs) + infer_log_probs = infer_log_probs if len(infer_log_probs) > 0 else old_log_probs + train_infer_metric = {} + if not self.pipeline_config.enable_old_logprobs_recompute: + train_infer_is_weight, filter_mask, train_infer_metric = compute_train_infer_correction( + cfg=self.pipeline_config.train_infer_correction, + response_mask=response_mask, + old_log_probs=old_log_probs, + infer_log_probs=infer_log_probs, + global_valid_samples=global_valid_samples['response_mask'], + global_valid_tokens=batch_num_tokens['response_mask'], + ) + + # Apply filter mask to both response_mask and final_response_mask + response_mask = response_mask.long() * filter_mask.long() + final_response_mask = final_response_mask.long() * filter_mask.long() + else: + train_infer_is_weight = data.batch['train_infer_is_weight'] + valid_samples = torch.any(final_response_mask > 0, dim=1).float() sample_weights = self.compute_sample_weights(data, response_mask) ratio = (log_probs - old_log_probs).exp() - + # 预先计算并缓存一些通用指标 self._cached_metrics = { "ratio": ratio, @@ -57,15 +83,15 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): } pg_variant = self._get_or_cache_config('pg_variant', 'vanilla') - + if not self._pg_variant_logged: self.logger.info(f"Policy Gradient Used: {pg_variant}") self._pg_variant_logged = True - + if pg_variant == 'vanilla': # Basic Policy Gradient pg_loss = self._compute_vanilla_pg_loss(ratio, log_probs, advantages) elif pg_variant == 'ppo': # Proximal Policy Optimization - pg_loss = self._compute_ppo_loss(ratio, advantages) + pg_loss = self._compute_ppo_loss(ratio, advantages, final_response_mask, batch_num_tokens, global_valid_samples) elif pg_variant == 'tis': # Truncated Importance Sampling pg_loss = self._compute_tis_loss(ratio, log_probs, old_log_probs, advantages, data) elif pg_variant == 'topr': # Tapered off-policy REINFORCE @@ -77,17 +103,26 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): else: raise ValueError(f"Unsupported pg_variant: {pg_variant}") + if self.pipeline_config.train_infer_correction.is_weight.enabled: + pg_loss = pg_loss * train_infer_is_weight + weighted_pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=final_response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode, weights=sample_weights) + loss_agg_mode=self.pipeline_config.loss_agg_mode, weights=sample_weights, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) original_pg_loss = agg_loss(loss_mat=pg_loss, loss_mask=final_response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode) + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) kl_loss = compute_approx_kl( log_probs=log_probs, 
log_probs_base=ref_log_probs, action_mask=final_response_mask, kl_penalty="k3" ) kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=final_response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode) + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) approxkl = compute_approx_kl( log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="mse" @@ -101,6 +136,8 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): loss_mat=entropy, loss_mask=data.batch["response_mask"][:, 1:], loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], ) # 缓存损失相关指标 @@ -126,17 +163,18 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): self._cached_metrics["total_loss"] = total_loss # 使用缓存的指标 - pg_metrics = self._get_pg_metrics(data) - + pg_metrics = self._get_pg_metrics(data, batch_num_tokens=batch_num_tokens, global_valid_samples=global_valid_samples,) + pg_metrics.update(train_infer_metric) return total_loss, pg_metrics - def _compute_ppo_loss(self, ratio: torch.Tensor, advantages: torch.Tensor): + def _compute_ppo_loss(self, ratio: torch.Tensor, advantages: torch.Tensor, final_response_mask: torch.Tensor, + batch_num_tokens: dict, global_valid_samples: dict): """ 计算PPO损失 """ pg_clip = self.pipeline_config.pg_clip pg_clip_low = self.pipeline_config.pg_clip_low if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip - pg_clip_high = self.pipeline_config.pg_clip_high if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip + pg_clip_high = self.pipeline_config.pg_clip_high if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip surr1 = ratio * advantages surr2 = ratio.clamp(1 - pg_clip_low, 1 + pg_clip_high) * advantages loss = -torch.min(surr1, surr2) @@ -144,16 +182,25 @@ def _compute_ppo_loss(self, ratio: torch.Tensor, advantages: torch.Tensor): dual_clip_loss = -torch.max(-loss, (1 + pg_clip * 2) * advantages) loss = torch.where(advantages < 0, dual_clip_loss, loss) - + # 缓存PPO相关指标 clipped_low = (ratio < 1 - pg_clip_low).float() clipped_high = (ratio > 1 + pg_clip_high).float() clipped = (clipped_low + clipped_high).float() - + self._cached_metrics.update({ - "ppo_ratio_high_clipfrac": clipped_high.mean().detach().item(), - "ppo_ratio_low_clipfrac": clipped_low.mean().detach().item(), - "ppo_ratio_clipfrac": clipped.mean().detach().item(), + "ppo_ratio_high_clipfrac": agg_loss(loss_mat=clipped_high, + loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask'],).detach().item(), + "ppo_ratio_low_clipfrac": agg_loss(loss_mat=clipped_low, + loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask'],).detach().item(), + "ppo_ratio_clipfrac": agg_loss(loss_mat=clipped, + loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask'],).detach().item(), }) return loss @@ -161,15 +208,15 @@ def _compute_ppo_loss(self, ratio: torch.Tensor, advantages: torch.Tensor): def _compute_vanilla_pg_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, advantages: torch.Tensor): """ 计算原始Policy Gradient损失 - + Args: ratio: 重要性采样比率 π(a|s) / π_old(a|s)
advantages: 优势函数值 - + Returns: pg_loss: Policy Gradient损失 """ - + return -log_probs * advantages def _compute_tis_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_log_probs: torch.Tensor, advantages: torch.Tensor, data: DataProto): @@ -181,17 +228,17 @@ def _compute_tis_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_lo # 缓存TIS配置 tis_lower_bound = self._get_or_cache_config('tis_lower_bound', 0.0) tis_upper_bound = self._get_or_cache_config('tis_upper_bound', 1.0) - + # 截断重要性采样比率 clipped_ratio = torch.clamp(ratio, min=tis_lower_bound, max=tis_upper_bound) TIS_loss = -clipped_ratio.detach() * advantages * log_probs - + # 缓存TIS相关指标 lower_clipped = (ratio < tis_lower_bound).float() upper_clipped = (ratio > tis_upper_bound).float() total_clipped = (lower_clipped + upper_clipped).float() - + self._cached_metrics.update({ "tis_lower_bound": tis_lower_bound, "tis_upper_bound": tis_upper_bound, @@ -200,38 +247,38 @@ def _compute_tis_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_lo "tis_total_clipfrac": total_clipped.mean().detach().item(), "tis_clipped_ratio": clipped_ratio.detach(), }) - + return TIS_loss - def _compute_topr_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_log_probs: torch.Tensor, + def _compute_topr_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_log_probs: torch.Tensor, advantages: torch.Tensor, data: DataProto): """ 计算TOPR (Tapered off-policy REINFORCE) 损失. https://arxiv.org/abs/2503.14286 - + 根据论文公式(8): ∇J_TOPR(π) = Σ_{τ∈T^+} μ(τ)R(τ)∇log π(τ) + Σ_{τ∈T^-} μ(τ)[π(τ)/μ(τ)]_0^1 R(τ)∇log π(τ) - + - 正样本(T^+): SFT更新, 直接对log π(τ)求导, 不使用importance sampling - 负样本(T^-): TIS更新, 使用clipped importance sampling ratio [0,1] - + Args: ratio: 重要性采样比率 π(a|s) / π_old(a|s) [batch_size, seq_len] log_probs: 当前策略的log概率 [batch_size, seq_len] old_log_probs: 旧策略的log概率 [batch_size, seq_len] - advantages: 优势函数值 [batch_size, seq_len] + advantages: 优势函数值 [batch_size, seq_len] data: 数据,包含奖励/分数信息 - + Returns: topr_loss: TOPR损失 [batch_size, seq_len] """ # 缓存TOPR配置 positive_weight = self._get_or_cache_config('topr_positive_weight', 1.0) negative_weight = self._get_or_cache_config('topr_negative_weight', 1.0) - + scores = data.batch['scores'] positive_mask = (scores > 0).float() negative_mask = (scores <= 0).float() - + if not self._topr_sample_logged: total_samples = len(scores) positive_count = positive_mask.sum().item() @@ -240,18 +287,18 @@ def _compute_topr_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_l self.logger.info(f"TOPR奖励统计 - 平均: {scores.mean().item():.4f}, 标准差: {scores.std().item():.4f}, 最大: {scores.max().item():.4f}, 最小: {scores.min().item():.4f}") self.logger.info(f"TOPR权重配置 - 正样本权重: {positive_weight}, 负样本权重: {negative_weight}") self._topr_sample_logged = True - + # 计算损失组件 positive_token_mask = positive_mask.unsqueeze(-1) negative_token_mask = negative_mask.unsqueeze(-1) - + positive_loss = - advantages * log_probs * positive_token_mask - + # 负样本: TIS更新,使用clipped importance sampling ratio # 梯度是: -[π(τ)/μ(τ)]_0^1 * R(τ) * ∇log π(τ) clipped_ratio = torch.clamp(ratio, min=0.0, max=1.0).detach() negative_loss = - clipped_ratio * advantages * log_probs * negative_token_mask - + weighted_positive_loss = positive_weight * positive_loss weighted_negative_loss = negative_weight * negative_loss @@ -264,6 +311,10 @@ def _compute_topr_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_l negative_upper_clipped = ((ratio > 1.0) & (negative_token_mask > 0)).float() negative_total_clipped = negative_lower_clipped + 
negative_upper_clipped self._cached_metrics.update({ + "topr_positive_loss": positive_loss, + "topr_negative_loss": negative_loss, + "topr_weighted_positive_loss": weighted_positive_loss, + "topr_weighted_negative_loss": weighted_negative_loss, "topr_positive_weight": positive_weight, "topr_negative_weight": negative_weight, "topr_positive_samples": positive_mask.sum().detach().item(), @@ -275,29 +326,25 @@ def _compute_topr_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_l "topr_negative_total_clipfrac": negative_total_clipped.mean().detach().item(), "topr_scores_mean": scores.mean().detach().item(), "topr_scores_std": scores.std().detach().item(), - "topr_positive_loss": positive_loss, - "topr_negative_loss": negative_loss, - "topr_weighted_positive_loss": weighted_positive_loss, - "topr_weighted_negative_loss": weighted_negative_loss, }) - + return topr_loss def _compute_cispo_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, advantages: torch.Tensor): """ 计算CISPO (Clipped Importance Sampling Policy Optimization) 损失 - + 根据论文: https://arxiv.org/abs/2503.14286 CISPO使用截断的重要性采样权重, 同时使用stop-gradient操作来稳定训练 - + 公式: J_CISPO(θ) = E[sg(r̂_t(θ)) * Â_t * log π_θ(a_t|s_t)] 其中: r̂_t(θ) = clip(r_t(θ), 1-ε_low^IS, 1+ε_high^IS) - + Args: ratio: 重要性采样比率 π(a|s) / π_old(a|s) [batch_size, seq_len] log_probs: 当前策略的log概率 [batch_size, seq_len] advantages: 优势函数值 [batch_size, seq_len] - + Returns: cispo_loss: CISPO损失 [batch_size, seq_len] """ @@ -305,38 +352,38 @@ def _compute_cispo_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, adva epsilon_low = self._get_or_cache_config('cispo_epsilon_low', 0.1) epsilon_high = self._get_or_cache_config('cispo_epsilon_high', 0.1) use_unified_mask = self._get_or_cache_config('cispo_use_unified_mask', False) - + clip_lower = 1.0 - epsilon_low clip_upper = 1.0 + epsilon_high - + if not self._cispo_config_logged: self.logger.info(f"CISPO配置 - epsilon_low: {epsilon_low}, epsilon_high: {epsilon_high}") self.logger.info(f"CISPO截断范围: [{clip_lower:.3f}, {clip_upper:.3f}]") self.logger.info(f"CISPO使用统一mask: {use_unified_mask}") self._cispo_config_logged = True - + clipped_ratio = torch.clamp(ratio, min=clip_lower, max=clip_upper) - + # 缓存CISPO相关指标 lower_clipped = (ratio < clip_lower).float() upper_clipped = (ratio > clip_upper).float() total_clipped = (lower_clipped + upper_clipped).float() - + if use_unified_mask: # 使用统一mask公式 (论文公式7). 
实际上应该和PPO一致了 # M_t = 0 if (A_t > 0 and r_t > 1+ε_high) or (A_t < 0 and r_t < 1-ε_low), else 1 positive_advantages = advantages > 0 negative_advantages = advantages < 0 - + mask_positive = positive_advantages & (ratio > clip_upper) mask_negative = negative_advantages & (ratio < clip_lower) - token_mask = ~(mask_positive | mask_negative) - + token_mask = ~(mask_positive | mask_negative) + cispo_loss = -clipped_ratio.detach() * advantages * log_probs * token_mask.float() else: cispo_loss = -clipped_ratio.detach() * advantages * log_probs - + cispo_metrics = { "cispo_epsilon_low": epsilon_low, "cispo_epsilon_high": epsilon_high, @@ -354,27 +401,27 @@ def _compute_cispo_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, adva "cispo_masked_negative_tokens": mask_negative.float().mean().detach().item(), "cispo_kept_tokens": token_mask.float().mean().detach().item(), }) - + self._cached_metrics.update(cispo_metrics) return cispo_loss def _compute_kimi15_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old_log_probs: torch.Tensor, advantages: torch.Tensor): """ 计算Kimi15损失 https://arxiv.org/pdf/2501.12599 - + 根据论文公式(3): 1/k Σ (∇_θ log π_θ(y_j, z_j|x)(r(x, y_j, y*) - r̄) - τ/2 ∇_θ (log π_θ(y_j, z_j|x)/π_θ_i(y_j, z_j|x))^2) - + 这相当于最小化损失函数的负值: L = -[(r - r̄) * log π_θ - τ/2 * (log π_θ/π_θ_i)^2] """ # 缓存Kimi15配置 tau = self._get_or_cache_config('kimi15_tau', 0.1) - + if not self._kimi15_config_logged: self.logger.info(f"Kimi15配置 - tau (正则化参数): {tau}") self._kimi15_config_logged = True - + # 计算并缓存指标 log_ratio = torch.log(ratio + 1e-8) policy_grad_magnitude = (advantages * log_ratio).abs().mean().item() @@ -391,10 +438,10 @@ def _compute_kimi15_loss(self, ratio: torch.Tensor, log_probs: torch.Tensor, old "kimi15_kl_reg_magnitude": kl_reg_magnitude, "kimi15_reg_ratio": kl_reg_magnitude / (policy_grad_magnitude + 1e-8), }) - + return kimi15_loss - def _get_pg_metrics(self, data: DataProto): + def _get_pg_metrics(self, data: DataProto, batch_num_tokens: dict, global_valid_samples: dict,): """ 获取Policy Gradient相关的指标,使用缓存的值避免重复计算 """ @@ -402,66 +449,85 @@ def _get_pg_metrics(self, data: DataProto): cached = self._cached_metrics ratio = cached["ratio"] response_mask = cached["response_mask"] - + # 构建基础指标 base_metrics = { - "actor/ratio_mean": masked_mean(ratio, response_mask, dim=-1).mean().detach().item(), - "actor/ratio_max": torch.max(ratio * response_mask).detach().item(), - "actor/ratio_min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), - "actor/pg_loss": cached["original_pg_loss"].detach().item(), - "actor/weighted_pg_loss": cached["weighted_pg_loss"].detach().item(), - "actor/kl_loss": cached["kl_loss"].detach().item(), - "actor/total_loss": cached["total_loss"].detach().item(), - "actor/approxkl": agg_loss(loss_mat=cached["approxkl"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/policykl": agg_loss(loss_mat=cached["policykl"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/valid_samples": cached["valid_samples"].sum().detach().item(), - "actor/total_samples": float(cached["valid_samples"].size(0)), + "actor/ratio_mean@sum": agg_loss(loss_mat=ratio, + loss_mask=response_mask, + loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['final_response_mask'],).detach().item(), + "actor/ratio_max@max": torch.max(ratio * response_mask).detach().item(), + "actor/ratio_min@min": torch.min(ratio * response_mask + (1 - 
response_mask) * 1e10).detach().item(), + "actor/pg_loss@sum": cached["original_pg_loss"].detach().item(), + "actor/weighted_pg_loss@sum": cached["weighted_pg_loss"].detach().item(), + "actor/kl_loss@sum": cached["kl_loss"].detach().item(), + "actor/total_loss@sum": cached["total_loss"].detach().item(), + "actor/approxkl@sum": agg_loss(loss_mat=cached["approxkl"], loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/policykl@sum": agg_loss(loss_mat=cached["policykl"], loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/valid_samples@sum": cached["valid_samples"].sum().detach().item(), + "actor/total_samples@sum": float(cached["valid_samples"].size(0)), "actor/valid_sample_ratio": (cached["valid_samples"].sum() / cached["valid_samples"].size(0)).detach().item(), "actor/sample_weights_mean": cached["sample_weights"].mean().detach().item(), "actor/sample_weights_min": cached["sample_weights"].min().detach().item(), "actor/sample_weights_max": cached["sample_weights"].max().detach().item(), } - + # 根据PG变体添加特定指标 pg_variant = cached["pg_variant"] - + if pg_variant == 'ppo': ppo_metrics = { - "actor/ppo_ratio_high_clipfrac": cached["ppo_ratio_high_clipfrac"], - "actor/ppo_ratio_low_clipfrac": cached["ppo_ratio_low_clipfrac"], - "actor/ppo_ratio_clipfrac": cached["ppo_ratio_clipfrac"], + "actor/ppo_ratio_high_clipfrac@sum": cached["ppo_ratio_high_clipfrac"], + "actor/ppo_ratio_low_clipfrac@sum": cached["ppo_ratio_low_clipfrac"], + "actor/ppo_ratio_clipfrac@sum": cached["ppo_ratio_clipfrac"], } base_metrics.update(ppo_metrics) - + elif pg_variant == 'tis': tis_metrics = { "actor/tis_lower_clipfrac": cached["tis_lower_clipfrac"], "actor/tis_upper_clipfrac": cached["tis_upper_clipfrac"], "actor/tis_total_clipfrac": cached["tis_total_clipfrac"], - "actor/tis_clipped_ratio_mean": masked_mean(cached["tis_clipped_ratio"], response_mask, dim=-1).mean().item(), + "actor/tis_clipped_ratio_mean@sum": agg_loss(loss_mat=cached["tis_clipped_ratio"], loss_mask=response_mask, + loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), "actor/tis_lower_bound": cached["tis_lower_bound"], "actor/tis_upper_bound": cached["tis_upper_bound"], } base_metrics.update(tis_metrics) - + elif pg_variant == 'topr': # 计算TOPR损失组件的聚合指标 topr_loss_metrics = { "actor/topr_positive_loss": agg_loss(loss_mat=cached["topr_positive_loss"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).item(), + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).item(), "actor/topr_negative_loss": agg_loss(loss_mat=cached["topr_negative_loss"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).item(), - "actor/topr_weighted_positive_loss": agg_loss(loss_mat=cached["topr_weighted_positive_loss"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).item(), - "actor/topr_weighted_negative_loss": agg_loss(loss_mat=cached["topr_weighted_negative_loss"], loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).item(), + 
loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).item(), + "actor/topr_weighted_positive_loss": agg_loss(loss_mat=cached["topr_weighted_positive_loss"], + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).item(), + "actor/topr_weighted_negative_loss": agg_loss(loss_mat=cached["topr_weighted_negative_loss"], + loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).item() } - + topr_metrics = { - "actor/topr_positive_samples": cached["topr_positive_samples"], - "actor/topr_negative_samples": cached["topr_negative_samples"], + "actor/topr_positive_samples@sum": cached["topr_positive_samples"], + "actor/topr_negative_samples@sum": cached["topr_negative_samples"], "actor/topr_positive_ratio": cached["topr_positive_ratio"], "actor/topr_negative_ratio": cached["topr_negative_ratio"], "actor/topr_negative_lower_clipfrac": cached["topr_negative_lower_clipfrac"], @@ -474,22 +540,22 @@ def _get_pg_metrics(self, data: DataProto): **topr_loss_metrics, } base_metrics.update(topr_metrics) - + elif pg_variant == 'cispo': cispo_metrics = { - f"actor/cispo_{key}": value for key, value in cached.items() + f"actor/cispo_{key}": value for key, value in cached.items() if key.startswith("cispo_") and key != "cispo_clipped_ratio" } - + # 特殊处理需要计算的指标 cispo_metrics["actor/cispo_clipped_ratio_mean"] = masked_mean(cached["cispo_clipped_ratio"], response_mask, dim=-1).mean().item() base_metrics.update(cispo_metrics) - + elif pg_variant == 'kimi15': kimi15_metrics = { - f"actor/kimi15_{key}": value for key, value in cached.items() + f"actor/kimi15_{key}": value for key, value in cached.items() if key.startswith("kimi15_") } base_metrics.update(kimi15_metrics) - + return base_metrics \ No newline at end of file diff --git a/roll/pipeline/rlvr/actor_worker.py b/roll/pipeline/rlvr/actor_worker.py index 19d0c66de..ccfdfb16f 100644 --- a/roll/pipeline/rlvr/actor_worker.py +++ b/roll/pipeline/rlvr/actor_worker.py @@ -4,7 +4,7 @@ from roll.distributed.scheduler.protocol import DataProto from roll.pipeline.base_worker import ActorWorker as BaseActorWorker from roll.utils.functionals import masked_mean, agg_loss, compute_approx_kl - +from roll.utils.train_infer_corrections import compute_train_infer_correction class ActorWorker(BaseActorWorker): @@ -19,6 +19,12 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): ref_log_probs = data.batch["ref_log_probs"] advantages = data.batch["advantages"] + batch_num_tokens = data.meta_info['batch_num_tokens'] + global_valid_samples = data.meta_info['global_valid_samples'] + if 'final_response_mask' not in batch_num_tokens: + batch_num_tokens['final_response_mask'] = batch_num_tokens['response_mask'] + global_valid_samples['final_response_mask'] = global_valid_samples['response_mask'] + log_probs = self.strategy.op_compute_log_probs( logits=output_tensor, input_ids=data.batch["input_ids"], attention_mask=data.batch["response_mask"] ) @@ -26,25 +32,34 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): infer_log_probs = data.batch.get("infer_logprobs", old_log_probs) infer_log_probs = infer_log_probs if len(infer_log_probs) > 0 else old_log_probs - loss_scale 
=None - if self.worker_config.use_dynamic_batching_in_train and self.pipeline_config.loss_agg_mode == "seq-mean-token-sum": - micro_batch_indices = data.meta_info["micro_batch_indices"] - mini_batch_size = micro_batch_indices[-1][-1] - micro_batch_indices[0][0] - num_micro_batch = len(micro_batch_indices) - micro_batch_size = data.batch.batch_size[0] - loss_scale = num_micro_batch * micro_batch_size / mini_batch_size + train_infer_metric = {} + if not self.pipeline_config.enable_old_logprobs_recompute: + train_infer_is_weight, filter_mask, train_infer_metric = compute_train_infer_correction( + cfg=self.pipeline_config.train_infer_correction, + response_mask=response_mask, + old_log_probs=old_log_probs, + infer_log_probs=infer_log_probs, + global_valid_samples=global_valid_samples['response_mask'], + global_valid_tokens=batch_num_tokens['response_mask'], + ) + + # Apply filter mask to both response_mask and final_response_mask + response_mask = response_mask.long() * filter_mask.long() + final_response_mask = final_response_mask.long() * filter_mask.long() + else: + train_infer_is_weight = data.batch['train_infer_is_weight'] valid_samples = torch.any(final_response_mask > 0, dim=1).float() sample_weights = self.compute_sample_weights(data, response_mask) - kl_loss = compute_approx_kl( log_probs=log_probs, log_probs_base=ref_log_probs, action_mask=final_response_mask, kl_penalty="k3" ) kl_loss = agg_loss(loss_mat=kl_loss, loss_mask=final_response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, - loss_scale=loss_scale) + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) approxkl = compute_approx_kl( log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="mse" @@ -53,40 +68,12 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): log_probs=log_probs, log_probs_base=old_log_probs, action_mask=response_mask, kl_penalty="kl" ) - train_infer_ratio = (old_log_probs - infer_log_probs).exp() - train_infer_diff = old_log_probs.exp() - infer_log_probs.exp() - train_infer_ratio_seq = masked_mean(old_log_probs - infer_log_probs, response_mask, dim=-1).exp().unsqueeze(-1).expand_as(train_infer_ratio) - train_infer_diff_seq = masked_mean(old_log_probs.exp() - infer_log_probs.exp(), response_mask, dim=-1).unsqueeze(-1).expand_as(train_infer_diff) - - train_infer_ratio_mask_mean = 1.0 - train_infer_diff_mask_mean = 1.0 - train_infer_ratio_seq_mask_mean = 1.0 - train_infer_diff_seq_mask_mean = 1.0 - - if self.pipeline_config.train_infer_ratio_mask: - train_infer_ratio_mask = (train_infer_ratio <= self.pipeline_config.train_infer_ratio_threshold_high).float() * (train_infer_ratio >= self.pipeline_config.train_infer_ratio_threshold_low).float() - train_infer_ratio_mask_mean = masked_mean(train_infer_ratio_mask, final_response_mask, dim=-1).mean().detach().item() - final_response_mask = final_response_mask * train_infer_ratio_mask - if self.pipeline_config.train_infer_diff_mask: - train_infer_diff_mask = (train_infer_diff <= self.pipeline_config.train_infer_diff_threshold_high).float() * (train_infer_diff >= self.pipeline_config.train_infer_diff_threshold_low).float() - train_infer_diff_mask_mean = masked_mean(train_infer_diff_mask, final_response_mask, dim=-1).mean().detach().item() - final_response_mask = final_response_mask * train_infer_diff_mask - - if self.pipeline_config.train_infer_ratio_seq_mask: - train_infer_ratio_seq_mask = (train_infer_ratio_seq <= 
self.pipeline_config.train_infer_ratio_seq_threshold_high).float() * (train_infer_ratio_seq >= self.pipeline_config.train_infer_ratio_seq_threshold_low).float() - train_infer_ratio_seq_mask_mean = masked_mean(train_infer_ratio_seq_mask, final_response_mask, dim=-1).mean().detach().item() - final_response_mask = final_response_mask * train_infer_ratio_seq_mask - if self.pipeline_config.train_infer_diff_seq_mask: - train_infer_diff_seq_mask = (train_infer_diff_seq <= self.pipeline_config.train_infer_diff_seq_threshold_high).float() * (train_infer_diff_seq >= self.pipeline_config.train_infer_diff_seq_threshold_low).float() - train_infer_diff_seq_mask_mean = masked_mean(train_infer_diff_seq_mask, final_response_mask, dim=-1).mean().detach().item() - final_response_mask = final_response_mask * train_infer_diff_seq_mask - if self.pipeline_config.importance_sampling == "token": ratio = (log_probs - old_log_probs).exp() elif self.pipeline_config.importance_sampling == "seq": log_ratio = log_probs - old_log_probs masked_log_ratio = masked_mean(log_ratio, final_response_mask, dim=-1) - ratio = masked_log_ratio.exp().unsqueeze(-1).expand_as(log_ratio) + ratio = masked_log_ratio.exp().unsqueeze(-1).expand_as(log_ratio) pg_clip_low = self.pipeline_config.pg_clip_low if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip pg_clip_high = self.pipeline_config.pg_clip_high if self.pipeline_config.use_pg_clip_range else self.pipeline_config.pg_clip @@ -99,16 +86,18 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): dual_clip_loss = -torch.max(-loss, (1 + self.pipeline_config.pg_clip * 2) * advantages) loss = torch.where(advantages < 0, dual_clip_loss, loss) - if self.pipeline_config.use_rollout_importance_sampling_ratio: - rollout_importance_sampling_clip = (train_infer_ratio > self.pipeline_config.rollout_importance_sampling_ratio_upper_bound).float() - loss = train_infer_ratio.clamp(0, self.pipeline_config.rollout_importance_sampling_ratio_upper_bound) * loss + if self.pipeline_config.train_infer_correction.is_weight.enabled: + loss = loss * train_infer_is_weight weighted_pg_loss = agg_loss(loss_mat=loss, loss_mask=final_response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, - weights=sample_weights, loss_scale=loss_scale) + weights=sample_weights, + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) original_pg_loss = agg_loss(loss_mat=loss, loss_mask=final_response_mask, loss_agg_mode=self.pipeline_config.loss_agg_mode, - loss_scale=loss_scale) + batch_num_tokens=batch_num_tokens['final_response_mask'], + global_valid_samples=global_valid_samples['final_response_mask'],) clipped_low = (ratio < 1 - pg_clip_low).float() clipped_high = (ratio > 1 + pg_clip_high).float() @@ -127,70 +116,52 @@ def loss_func(self, data: DataProto, output_tensor: torch.Tensor): loss_mat=entropy, loss_mask=data.batch["response_mask"][:, 1:], loss_agg_mode=self.pipeline_config.loss_agg_mode, - loss_scale=loss_scale + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'], ) total_loss = total_loss - entropy_loss * self.pipeline_config.entropy_loss_coef - metrics = {} - if self.pipeline_config.postive_loss_coef > 0: - response_positive_mask = (data.batch['scores'] > 0).unsqueeze(-1).expand_as(final_response_mask) - # TODO: 是否应该乘上adv? 
- postive_loss = agg_loss(loss_mat=-log_probs * advantages, loss_mask=final_response_mask * response_positive_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode, weights=torch.ones_like(sample_weights), - loss_scale=loss_scale) - total_loss = total_loss + postive_loss * self.pipeline_config.postive_loss_coef - metrics['actor/postive_loss'] = postive_loss.detach().item() - - if self.pipeline_config.use_topr_neg_loss_coef > 0: - response_negative_mask = (data.batch['scores'] <= 0).unsqueeze(-1).expand_as(final_response_mask) - clipped_ratio = torch.clamp((log_probs.detach() - old_log_probs).exp(), 0 , 1) - topr_neg_loss = agg_loss(loss_mat=-clipped_ratio * log_probs * advantages, loss_mask=final_response_mask * response_negative_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode, weights=torch.ones_like(sample_weights), - loss_scale=loss_scale) - total_loss = total_loss + topr_neg_loss * self.pipeline_config.use_topr_neg_loss_coef - metrics['actor/topr_neg_loss'] = topr_neg_loss.detach().item() - - train_infer_prob_metric = { - "actor/train_infer_ratio_mean": masked_mean(train_infer_ratio, response_mask, dim=-1).mean().detach().item(), - "actor/train_infer_diff_mean": masked_mean(train_infer_diff, response_mask, dim=-1).mean().detach().item(), - "actor/train_infer_ratio_mask_mean": train_infer_ratio_mask_mean, - "actor/train_infer_diff_mask_mean": train_infer_diff_mask_mean, - "actor/train_infer_ratio_seq_mask_mean": train_infer_ratio_seq_mask_mean, - "actor/train_infer_diff_seq_mask_mean": train_infer_diff_seq_mask_mean, - } - loss_metric = { - "actor/ppo_ratio_high_clipfrac": clipped_high.mean().detach().item(), - "actor/ppo_ratio_low_clipfrac": clipped_low.mean().detach().item(), - "actor/ppo_ratio_clipfrac": clipped.mean().detach().item(), - "actor/ratio_mean": masked_mean(ratio, response_mask, dim=-1).mean().detach().item(), - "actor/ratio_max": torch.max(ratio * response_mask).detach().item(), - "actor/ratio_min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), - "actor/clipfrac": agg_loss(loss_mat=torch.lt(surr2, surr1).float(), loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode, loss_scale=loss_scale).detach().item(), - } - - if self.pipeline_config.use_rollout_importance_sampling_ratio: - loss_metric["actor/rollout_importance_sampling_clip"] = rollout_importance_sampling_clip.mean().detach().item() + "actor/ppo_ratio_high_clipfrac@sum": agg_loss(loss_mat=clipped_high, loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask']).detach().item(), + "actor/ppo_ratio_low_clipfrac@sum": agg_loss(loss_mat=clipped_low, loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask']).detach().item(), + "actor/ppo_ratio_clipfrac@sum": agg_loss(loss_mat=clipped, loss_mask=final_response_mask, + loss_agg_mode='token-mean', + batch_num_tokens=batch_num_tokens['final_response_mask']).detach().item(), + "actor/ratio_mean@sum": agg_loss(loss_mat=ratio, loss_mask=response_mask, + loss_agg_mode='seq-mean-token-mean', + global_valid_samples=global_valid_samples['response_mask']).detach().item(), + "actor/ratio_max@max": torch.max(ratio * response_mask).detach().item(), + "actor/ratio_min@min": torch.min(ratio * response_mask + (1 - response_mask) * 1e10).detach().item(), + "actor/clipfrac@sum": agg_loss(loss_mat=torch.lt(surr2, surr1).float(), loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, 
batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask']).detach().item(), + } pg_metrics = { - "actor/pg_loss": original_pg_loss.detach().item(), - "actor/weighted_pg_loss": weighted_pg_loss.detach().item(), - "actor/kl_loss": kl_loss.detach().item(), - "actor/total_loss": total_loss.detach().item(), - "actor/approxkl": agg_loss(loss_mat=approxkl, loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/policykl": agg_loss(loss_mat=policykl, loss_mask=response_mask, - loss_agg_mode=self.pipeline_config.loss_agg_mode).detach().item(), - "actor/valid_samples": valid_samples.sum().detach().item(), - "actor/total_samples": float(valid_samples.size(0)), - "actor/valid_sample_ratio": (valid_samples.sum() / valid_samples.size(0)).detach().item(), - "actor/sample_weights_mean": sample_weights.mean().detach().item(), - "actor/sample_weights_min": sample_weights.min().detach().item(), - "actor/sample_weights_max": sample_weights.max().detach().item(), - **metrics, + "actor/pg_loss@sum": original_pg_loss.detach().item(), + "actor/weighted_pg_loss@sum": weighted_pg_loss.detach().item(), + "actor/kl_loss@sum": kl_loss.detach().item(), + "actor/total_loss@sum": total_loss.detach().item(), + "actor/approxkl@sum": agg_loss(loss_mat=approxkl, loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/policykl@sum": agg_loss(loss_mat=policykl, loss_mask=response_mask, + loss_agg_mode=self.pipeline_config.loss_agg_mode, + batch_num_tokens=batch_num_tokens['response_mask'], + global_valid_samples=global_valid_samples['response_mask'],).detach().item(), + "actor/valid_samples@sum": valid_samples.sum().detach().item(), + "actor/total_samples@sum": float(valid_samples.size(0)), + "actor/valid_sample_ratio@sum": (valid_samples.sum() / global_valid_samples['response_mask']).detach().item(), + "actor/sample_weights_mean@mean": sample_weights.mean().detach().item(), + "actor/sample_weights_min@min": sample_weights.min().detach().item(), + "actor/sample_weights_max@max": sample_weights.max().detach().item(), **loss_metric, - **train_infer_prob_metric + **train_infer_metric, } return total_loss, pg_metrics diff --git a/roll/pipeline/rlvr/rewards/__init__.py b/roll/pipeline/rlvr/rewards/__init__.py index 9d939d88f..26b9beebd 100644 --- a/roll/pipeline/rlvr/rewards/__init__.py +++ b/roll/pipeline/rlvr/rewards/__init__.py @@ -3,5 +3,4 @@ from roll.pipeline.rlvr.rewards.general_val_rule_reward_worker import GeneralValRuleRewardWorker from roll.pipeline.rlvr.rewards.ifeval_rule_reward_worker import GeneralRuleRewardWorker from roll.pipeline.rlvr.rewards.llm_judge_reward_worker import LLMJudgeRewardWorker -from roll.pipeline.rlvr.rewards.math_rule_reward_worker import MathRuleRewardWorker - +from roll.pipeline.rlvr.rewards.math_rule_reward_worker import MathRuleRewardWorker \ No newline at end of file diff --git a/roll/pipeline/rlvr/rewards/crossthinkqa_rule_reward_worker.py b/roll/pipeline/rlvr/rewards/crossthinkqa_rule_reward_worker.py index fd1cfcad5..163793d85 100644 --- a/roll/pipeline/rlvr/rewards/crossthinkqa_rule_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/crossthinkqa_rule_reward_worker.py @@ -12,8 +12,6 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto -from
roll.distributed.strategy.factory import create_strategy -from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_reward_model_provider, default_tokenizer_provider @@ -154,7 +152,6 @@ def __init__(self, worker_config: WorkerConfig): self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None self.repetition_penalty_reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) self.response_length_reward_fn = get_response_length_reward(min_len=100, max_len=400) diff --git a/roll/pipeline/rlvr/rewards/detection_reward_worker.py b/roll/pipeline/rlvr/rewards/detection_reward_worker.py index f57da70f9..57940333d 100644 --- a/roll/pipeline/rlvr/rewards/detection_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/detection_reward_worker.py @@ -30,7 +30,7 @@ from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy -from roll.models.model_providers import default_tokenizer_provider +from roll.models.model_providers import default_processor_provider from roll.utils.logging import get_logger @@ -1172,6 +1172,7 @@ def extract_answer_content(text): def normalize_bbox_by_real_size(pred_bboxes, input_width, input_height, normalize_size=1000.0): + # refer to https://github.com/QwenLM/Qwen2.5-VL/issues/721 for qwen2.5-vl bbox if pred_bboxes is None: return None @@ -1624,10 +1625,12 @@ def __init__(self, worker_config: WorkerConfig): self.worker_config = worker_config self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size - self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) + self.processor = default_processor_provider(model_args=self.worker_config.model_args) + self.tokenizer = self.processor.tokenizer self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None - self.patch_size = 14 # hard-code to qwen2.5-vl temporarily + # qwen2.5-vl use 14, while qwen2-vl/qwen3-vl/qwen3-omni use 16 + self.patch_size = self.processor.image_processor.patch_size @register(dispatch_mode=Dispatch.ONE_TO_ALL) def initialize(self, pipeline_config): @@ -1658,6 +1661,14 @@ def compute_rewards(self, data: DataProto): ] verifier_parm["image_grid_thw"] = image_grid_thw verifier = DetectionVerifier(**verifier_parm) + # qwen2.5-vl uses absolute coordinates, while qwen2-vl/qwen3-vl/qwen3-omni + # uses relative coordinates which were scaled to [0,1000], refer to + # https://github.com/QwenLM/Qwen3-VL/issues/721 + # https://github.com/QwenLM/Qwen3-VL/issues/1937 + # and the ground truth in One-RL-to-See-Them-All/Orsta-Data-47k is also scaled + # hacky to set det_verifier_normalized to different value temporarily + if not self.processor.__class__.__name__.startswith("Qwen2_5"): + verifier.det_verifier_normalized = False # Initialize default result result = { "rewards": { @@ -1669,6 +1680,9 @@ def compute_rewards(self, data: DataProto): } format_score = verifier.verify_format(response) accuracy_score_gathered = verifier.verify_accuracy(response, ground_truth) + self.logger.debug( + f"{json.dumps(dict(verifier_parm=verifier_parm, response=response, ground_truth=ground_truth, accuracy_score_gathered=accuracy_score_gathered))}" + ) if 
isinstance(accuracy_score_gathered, dict): accuracy_score = accuracy_score_gathered['final_score'] diff --git a/roll/pipeline/rlvr/rewards/general_val_rule_reward_worker.py b/roll/pipeline/rlvr/rewards/general_val_rule_reward_worker.py index e56fa619b..11d3ccea0 100644 --- a/roll/pipeline/rlvr/rewards/general_val_rule_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/general_val_rule_reward_worker.py @@ -12,8 +12,6 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto -from roll.distributed.strategy.factory import create_strategy -from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_reward_model_provider, default_tokenizer_provider @@ -109,7 +107,6 @@ def __init__(self, worker_config: WorkerConfig): self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None @register(dispatch_mode=Dispatch.ONE_TO_ALL) def initialize(self, pipeline_config): diff --git a/roll/pipeline/rlvr/rewards/ifeval_rule_reward_worker.py b/roll/pipeline/rlvr/rewards/ifeval_rule_reward_worker.py index 07dc29202..bcc2ab85e 100644 --- a/roll/pipeline/rlvr/rewards/ifeval_rule_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/ifeval_rule_reward_worker.py @@ -20,8 +20,6 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto -from roll.distributed.strategy.factory import create_strategy -from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy import string from difflib import SequenceMatcher import nltk @@ -564,7 +562,6 @@ def __init__(self, worker_config: WorkerConfig): self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None self.repetition_penalty_reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.5) # nltk.download('wordnet') # nltk.download('omw-1.4') diff --git a/roll/pipeline/rlvr/rewards/llm_judge_reward_worker.py b/roll/pipeline/rlvr/rewards/llm_judge_reward_worker.py index 955aa9419..46446b617 100644 --- a/roll/pipeline/rlvr/rewards/llm_judge_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/llm_judge_reward_worker.py @@ -37,7 +37,7 @@ def __init__(self, worker_config: WorkerConfig): # LLM judge相关配置 self.judge_prompt = self.worker_config.judge_prompt if hasattr(self.worker_config, "judge_prompt") else None - self.judge_prompt = prompt_maps[self.judge_prompt] + self.judge_prompt = prompt_maps.get(self.judge_prompt, None) self.judge_model_type = ( self.worker_config.judge_model_type if hasattr(self.worker_config, "judge_model_type") else "api" ) @@ -56,7 +56,12 @@ def initialize(self, pipeline_config): print(f"{self.worker_name} initialized with API model") elif self.judge_model_type == "inference": - self.strategy = create_strategy(worker=self) + async_strategy = self.worker_config.strategy_args.strategy_name in ["vllm", "sglang"] + if self.worker_config.strategy_args.strategy_name == "sglang": # not weight sync, need backup weights + 
self.worker_config.strategy_args.strategy_config["enable_weights_cpu_backup"] = True + if self.worker_config.strategy_args.strategy_name == "vllm": + self.worker_config.strategy_args.strategy_config["sleep_level"] = 1 + self.strategy = create_strategy(worker=self, sync_wrapper=async_strategy) self.strategy.initialize(model_provider=default_reward_model_provider) self.tokenizer = self.strategy.tokenizer print(f"{self.worker_name} initialized with inference model") diff --git a/roll/pipeline/rlvr/rewards/math_rule_reward_worker.py b/roll/pipeline/rlvr/rewards/math_rule_reward_worker.py index 7a7b8df19..e9b1380fa 100644 --- a/roll/pipeline/rlvr/rewards/math_rule_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/math_rule_reward_worker.py @@ -20,10 +20,7 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto -from roll.distributed.strategy.factory import create_strategy -from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_reward_model_provider, default_tokenizer_provider -from roll.utils.context_managers import state_offload_manger class timeout: def __init__(self, seconds=1, error_message="Timeout"): @@ -39,7 +36,41 @@ def __enter__(self): def __exit__(self, type, value, traceback): signal.alarm(0) - + +def check_and_extract_within_boxed(response, boxed_start="\\boxed{", boxed_start_list=["\\boxed\{", "\\boxed{"]): + if len(boxed_start_list) > 0: + for boxed_start in boxed_start_list: + last_boxed_index = response.rfind(boxed_start) + if last_boxed_index == -1: + continue + else: + boxed_content_start_index = last_boxed_index + len(boxed_start) + break + if last_boxed_index == -1: + return False, "" + else: + last_boxed_index = response.rfind(boxed_start) + if last_boxed_index == -1: + return False, "" + boxed_content_start_index = last_boxed_index + len(boxed_start) + cur_index = boxed_content_start_index + left_curly_brace_cnt = 0 + left_double_curly_quote = False + while cur_index < len(response): + if response[cur_index:].startswith("\""): + left_double_curly_quote = not left_double_curly_quote + elif left_double_curly_quote == False and response[cur_index:].startswith("{"): + left_curly_brace_cnt += 1 + elif left_double_curly_quote == False and response[cur_index:].startswith("}"): + if left_curly_brace_cnt == 0: + return True, response[boxed_content_start_index:cur_index] + else: + left_curly_brace_cnt -= 1 + if left_curly_brace_cnt < 0: + return False, response[boxed_content_start_index:] + cur_index += 1 + return False, response[boxed_content_start_index:] + def _extract_after_last_end_think(response: str, prompt: str, start_think: str='<think>', end_think: str='</think>') -> str: """ 提取字符串中最后一个 "</think>" 标签之后的所有文本。 @@ -123,7 +154,11 @@ def _hf_verify_math_sample(response, answer, result, prompt): => 默认值: False (不抛出异常,返回空列表) => 建议:保持默认值,确保程序的健壮性,不会因单个样本出错而中断。 """ - parsed_answers = parse(cleaned_response, fallback_mode="no_fallback") + is_success, extracted_answer = check_and_extract_within_boxed(cleaned_response) + if not is_success: + parsed_answers = parse(cleaned_response, fallback_mode="no_fallback") + else: + parsed_answers = parse(f"${extracted_answer}$", fallback_mode="no_fallback") # 如果解析结果为空,则认为提取失败 if not parsed_answers: @@ -215,7 +250,6 @@ def __init__(self, worker_config: WorkerConfig): self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size self.tokenizer
= default_tokenizer_provider(model_args=self.worker_config.model_args) - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None self.repetition_penalty_reward_fn = get_repetition_penalty_reward(ngram_size=3, max_penalty=-0.1) self.format_pattern = getattr(self.worker_config, "format_pattern", None) diff --git a/roll/pipeline/rlvr/rewards/multiple_choice_boxed_rule_reward_worker.py b/roll/pipeline/rlvr/rewards/multiple_choice_boxed_rule_reward_worker.py index 93367ed75..450483755 100644 --- a/roll/pipeline/rlvr/rewards/multiple_choice_boxed_rule_reward_worker.py +++ b/roll/pipeline/rlvr/rewards/multiple_choice_boxed_rule_reward_worker.py @@ -12,8 +12,6 @@ from roll.distributed.executor.worker import Worker from roll.distributed.scheduler.decorator import Dispatch, register from roll.distributed.scheduler.protocol import DataProto -from roll.distributed.strategy.factory import create_strategy -from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy from roll.models.model_providers import default_reward_model_provider, default_tokenizer_provider @@ -88,7 +86,6 @@ def __init__(self, worker_config: WorkerConfig): self.rank_info.dp_rank = self.rank_info.rank self.rank_info.dp_size = self.rank_info.world_size self.tokenizer = default_tokenizer_provider(model_args=self.worker_config.model_args) - self.strategy: Optional[Union[InferenceStrategy, TrainStrategy]] = None @register(dispatch_mode=Dispatch.ONE_TO_ALL) def initialize(self, pipeline_config): diff --git a/roll/pipeline/rlvr/rlvr_config.py b/roll/pipeline/rlvr/rlvr_config.py index ba51d62ef..40222e8f8 100644 --- a/roll/pipeline/rlvr/rlvr_config.py +++ b/roll/pipeline/rlvr/rlvr_config.py @@ -74,7 +74,8 @@ class RewardConfig(WorkerConfig): default_factory=RewardFilterConfig, metadata={"help": "Arguments passed to reward response filtering"}, ) - + + reward_manager_config: List[Dict[str, Any]] = field(default_factory=list, metadata={"help": "The reward system plugin config."}) @dataclass @@ -92,23 +93,6 @@ class RLVRConfig(PPOConfig): metadata={"help": "The number of return sequences in one group, used in generation_args."} ) - generate_opt_level: int = field( - default=1, - metadata={ - "help": "generate optimizing level: 0 use base batch generate interface, 1 use scheduler process requests" - }, - ) - is_num_return_sequences_expand: bool = field( - default=False, - metadata={"help": "whether replicate `num_return_sequences` times in prompts or not."} - ) - is_use_additional_prompts: bool = field( - default=False, - metadata={"help": "Whether to use additional prompts or not."} - ) - max_additional_running_prompts: int = field( - default=16, metadata={"help": "The additional number of running prompts, beyond batch_size."} - ) save_logging_board_dir: str = field( default=None, metadata={"help": "saving directory of logging board_metrics"} ) @@ -149,22 +133,6 @@ class RLVRConfig(PPOConfig): importance_sampling: Literal["token", "seq"] = ( field(default="token", metadata={"help": "policy importance sampling"}) ) - use_rollout_importance_sampling_ratio: bool = field(default=False, metadata={"help": "apply train/infer ratio as token-level loss weight"}) - rollout_importance_sampling_ratio_upper_bound: float = field(default=1.2) - - train_infer_ratio_mask: bool = field(default=False, metadata={"help": "apply train/infer ratio as token-level response mask"}) - train_infer_ratio_threshold_low: float = field(default=0.8) - train_infer_ratio_threshold_high: float = field(default=1.2) - train_infer_diff_mask: bool 
= field(default=False, metadata={"help": "apply train-infer diff as token-level response mask"}) - train_infer_diff_threshold_low: float = field(default=-0.2) - train_infer_diff_threshold_high: float = field(default=0.2) - - train_infer_ratio_seq_mask: bool = field(default=False, metadata={"help": "apply train/infer ratio as sequence-level response mask"}) - train_infer_ratio_seq_threshold_low: float = field(default=0.8) - train_infer_ratio_seq_threshold_high: float = field(default=1.2) - train_infer_diff_seq_mask: bool = field(default=False, metadata={"help": "apply train-infer diff as sequence-level response mask"}) - train_infer_diff_seq_threshold_low: float = field(default=-0.2) - train_infer_diff_seq_threshold_high: float = field(default=0.2) val_greedy: bool = field(default=False, metadata={"help": "Use greedy for validation"}) val_n_sample: int = field(default=1, metadata={"help": "Number of samples for validation"}) @@ -186,7 +154,7 @@ def __post_init__(self): if self.actor_train.worker_cls is None: self.actor_train.worker_cls = "roll.pipeline.rlvr.actor_worker.ActorWorker" if self.actor_infer.worker_cls is None: - self.actor_infer.worker_cls = "roll.pipeline.rlvr.actor_worker.ActorWorker" + self.actor_infer.worker_cls = "roll.pipeline.base_worker.InferWorker" if self.reference.worker_cls is None: self.reference.worker_cls = "roll.pipeline.rlvr.actor_worker.ActorWorker" if self.critic.worker_cls is None: @@ -252,7 +220,5 @@ def __post_init__(self): else: self.num_nodes = (max_gpu_num + self.num_gpus_per_node - 1) // self.num_gpus_per_node - self.validate_worker_config() - def to_dict(self): return dataclasses.asdict(self) diff --git a/roll/pipeline/rlvr/rlvr_math_vlm_pipeline.py b/roll/pipeline/rlvr/rlvr_math_vlm_pipeline.py index 0bc2fc664..fcc12e0df 100644 --- a/roll/pipeline/rlvr/rlvr_math_vlm_pipeline.py +++ b/roll/pipeline/rlvr/rlvr_math_vlm_pipeline.py @@ -35,6 +35,7 @@ compute_clip_fraction, group_reward_norm, expand_to_token_level, + get_sample_level_mask ) from roll.utils.kl_controller import get_kl_controller from roll.utils.logging import get_logger @@ -337,7 +338,6 @@ def run(self): ), ) gen_batch.meta_info = {"global_step": global_step} - gen_batch.meta_info["response_callback_fn"] = self.generate_scheduler.report_response.remote generate_output: DataProto = ray.get( self.generate_scheduler.generate.remote( data=gen_batch, @@ -360,6 +360,7 @@ def run(self): value, self.actor_infer.worker_config.generating_args.num_return_sequences ) batch.non_tensor_batch['sample_uuid'] = np.array([str(uuid.uuid4()) for _ in range(batch.batch.shape[0])], dtype=object) + batch.meta_info["loss_mask_keys"] = ["response_mask", "final_response_mask"] with Timer(name="cal_ref_log_probs_reward", logger=None) as cal_timer: if self.pipeline_config.enable_reference: @@ -383,6 +384,11 @@ def run(self): batch = batch.union(rewards) metrics["time/ref_log_probs_values_reward"] = cal_timer.last + with Timer(name="get_sample_level_mask", logger=None) as get_sample_level_mask_timer: + batch, mask_metrics = get_sample_level_mask(batch, self.pipeline_config) + metrics.update(mask_metrics) + metrics["time/get_sample_level_mask"] = get_sample_level_mask_timer.last + with Timer(name="cal_old_log_probs_values", logger=None) as cal_old_logpb_timer: if self.is_lora: batch.meta_info["disable_adapter"] = False @@ -591,7 +597,6 @@ def val(self): non_tensor_batch_keys=["multi_modal_data"] if "multi_modal_data" in batch.non_tensor_batch else [], ) gen_batch.meta_info["is_offload_states"] = False - 
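The train/infer correction knobs deleted from RLVRConfig here are not simply dropped; the pipeline later applies a correction through roll.utils.train_infer_corrections.apply_train_infer_correction_to_batch. The token-level check described by the removed help strings can be sketched as follows (hypothetical helper name; the 0.8 and 1.2 bounds mirror the removed defaults):

import torch

def train_infer_ratio_mask(old_log_probs, rollout_log_probs, response_mask,
                           low: float = 0.8, high: float = 1.2):
    # ratio > 1 means the training engine now assigns the sampled token more
    # probability than the inference engine did at rollout time; large drift
    # in either direction signals train/infer mismatch.
    ratio = torch.exp(old_log_probs - rollout_log_probs)
    in_range = (ratio >= low) & (ratio <= high)
    return response_mask * in_range.to(response_mask.dtype)

Tokens whose ratio falls outside [low, high] are excluded from the loss; the sequence-level variants apply the same test to a per-sequence aggregate of the ratio.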
gen_batch.meta_info["response_callback_fn"] = self.generate_scheduler.report_response.remote generate_output: DataProto = ray.get( self.generate_scheduler.generate.remote( data=gen_batch, diff --git a/roll/pipeline/rlvr/rlvr_pipeline.py b/roll/pipeline/rlvr/rlvr_pipeline.py index b590077d6..73d1a32ce 100644 --- a/roll/pipeline/rlvr/rlvr_pipeline.py +++ b/roll/pipeline/rlvr/rlvr_pipeline.py @@ -1,6 +1,5 @@ import copy import json -import math import os import time import uuid @@ -19,14 +18,15 @@ from roll.configs import GeneratingArguments from roll.datasets.chat_template import get_chat_template from roll.datasets.collator import DataCollatorWithPaddingForPaddedKeys +from roll.datasets.dataset import get_dataset from roll.distributed.executor.cluster import Cluster -from roll.distributed.scheduler.async_generate_scheduler import AsyncDynamicSamplingScheduler from roll.distributed.scheduler.generate_scheduler import DynamicSamplingScheduler from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.base_pipeline import BasePipeline from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.pipeline.rlvr.utils import dump_rollout_to_specific_path +from roll.utils.dynamic_batching import dynamic_batching_shard from roll.utils.functionals import ( RunningMoments, agg_loss, @@ -35,27 +35,25 @@ get_sample_level_mask, reduce_metrics, reward_postprocess, + batch_balance ) +from roll.utils.train_infer_corrections import apply_train_infer_correction_to_batch from roll.utils.kl_controller import get_kl_controller from roll.utils.logging import get_logger from roll.utils.metrics.metrics_manager import MetricsManager -from roll.utils.dynamic_batching import dynamic_batching_shard +from roll.utils.offload_states import OffloadStateType logger = get_logger() def is_lora_training(pipeline_config: RLVRConfig) -> bool: - if pipeline_config.actor_train.model_args.lora_target is None: - return False - assert pipeline_config.actor_train.strategy_args.strategy_name == "deepspeed_train", ( - "LoRA only supports deepspeed_train" - ) - return True + return pipeline_config.actor_train.model_args.lora_target is not None def preprocess_dataset(dataset, prompt_len, encode_function, data_args): - logger.info(f"Begin : {dataset}") + # 处理数据 + print(f"Begin : {dataset}") dataset = dataset.map( encode_function, batched=True, @@ -69,12 +67,12 @@ def preprocess_dataset(dataset, prompt_len, encode_function, data_args): num_proc=data_args.preprocessing_num_workers, desc="Filtering dataset", ) - logger.info(f"Filtering prompt len: {dataset}") - logger.info(f"Encoding: {dataset}") + print(f"Filtering prompt len: {dataset}") + print(f"Encoding: {dataset}") return dataset -def get_encode_function(template_name, data_args, tokenizer): +def get_encode_function(template_name, tokenizer, data_args): chat_template_func = get_chat_template(template_name, tokenizer) def encode_function(data_i): @@ -92,36 +90,13 @@ def encode_function(data_i): return encode_function + def update_dataset_domain(tag_2_domain: Dict[str, set[str]], row): if "domain" in row and row["domain"] is not None: return row row["domain"] = tag_2_domain.get(row["tag"], "math_rule") return row -def query_filter_fn(data_list: List[DataProto], config: RLVRConfig) -> bool: - """ - 各domain的过滤规则可以自定义 - """ - response_level_rewards = [data.batch["response_level_rewards"] for data in data_list] - if len(response_level_rewards) == 1: - return True - rewards = 
torch.cat(response_level_rewards, dim=0) - - domain = data_list[0].non_tensor_batch["domain"][0] - query_filter_config = config.rewards[domain].query_filter_config - - if query_filter_config.type == "no_filter": - return True - elif query_filter_config.type == "mean_filter": - threshold_up = query_filter_config.filter_args.get("threshold_up", math.inf) - threshold_down = query_filter_config.filter_args.get("threshold_down", -1) - if torch.mean(rewards) <= threshold_down or torch.mean(rewards) >= threshold_up: - return False - elif query_filter_config.type == "std_filter": - std_threshold = query_filter_config.filter_args.get("std_threshold", -1) - if torch.std(rewards) <= std_threshold: - return False - return True class RLVRPipeline(BasePipeline): @@ -129,10 +104,7 @@ class RLVRPipeline(BasePipeline): def __init__(self, pipeline_config: RLVRConfig): super().__init__(pipeline_config) self.pipeline_config = pipeline_config - self.is_lora = is_lora_training(self.pipeline_config) - scheduler_cls = ( - AsyncDynamicSamplingScheduler if self.pipeline_config.async_pipeline else DynamicSamplingScheduler - ) + self.use_ref_model = self.pipeline_config.enable_reference and (not is_lora_training(self.pipeline_config)) self.tokenizer = default_tokenizer_provider(model_args=self.pipeline_config.actor_train.model_args) dataset_paths = [] @@ -140,12 +112,11 @@ def __init__(self, pipeline_config: RLVRConfig): dataset_paths.extend(self.pipeline_config.actor_train.data_args.file_name) print(f"load_dataset_paths: {chr(10)} {chr(10).join(dataset_paths)}") - dataset = datasets.load_dataset("json", data_files=dataset_paths)["train"] + dataset = get_dataset(self.pipeline_config.actor_train.data_args) self.val_dataset = None if self.pipeline_config.validation and self.pipeline_config.validation.data_args: - val_dataset_paths = self.pipeline_config.validation.data_args.file_name - self.val_dataset = datasets.load_dataset("json", data_files=val_dataset_paths)["train"] + self.val_dataset = get_dataset(self.pipeline_config.validation.data_args) # 加上format,然后转ids的func template_name = ( @@ -153,7 +124,7 @@ def __init__(self, pipeline_config: RLVRConfig): if self.pipeline_config.global_template else self.pipeline_config.actor_train.data_args.template ) - encode_function = get_encode_function(template_name, self.pipeline_config.actor_train.data_args, self.tokenizer) + encode_function = get_encode_function(template_name, self.tokenizer, self.pipeline_config.actor_train.data_args) dataset = preprocess_dataset( dataset, @@ -182,7 +153,7 @@ def __init__(self, pipeline_config: RLVRConfig): self.val_dataset, self.pipeline_config.prompt_length, encode_function, - data_args=self.pipeline_config.validation.data_args, + data_args=self.pipeline_config.actor_train.data_args, ) self.val_dataset = self.val_dataset.map( partial(update_dataset_domain, self.pipeline_config.tag_2_domain), @@ -218,7 +189,7 @@ def __init__(self, pipeline_config: RLVRConfig): ) download_clusters = [self.actor_train, self.actor_infer] # use unwrapped model as reference for lora training - if not self.is_lora and self.pipeline_config.enable_reference: + if self.use_ref_model: self.reference: Any = Cluster( name=self.pipeline_config.reference.name, worker_cls=self.pipeline_config.reference.worker_cls, @@ -257,7 +228,7 @@ def __init__(self, pipeline_config: RLVRConfig): else: domain_batch_size = int(domain_ratios[domain] * self.pipeline_config.rollout_batch_size) accumulated += domain_batch_size - generate_scheduler = scheduler_cls.options( + generate_scheduler 
= ray.remote(DynamicSamplingScheduler).options( scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False, @@ -270,9 +241,6 @@ def __init__(self, pipeline_config: RLVRConfig): dataset=self.domain_datasets[domain], collect_fn_cls=DataCollatorWithPaddingForPaddedKeys, collect_fn_kwargs=dict(max_length=self.pipeline_config.prompt_length, padding="max_length"), - response_filter_fn=lambda data_item, config: True, - query_filter_fn=query_filter_fn, - response_callback_fn=generate_scheduler.report_response.remote, state=self.state.kv.get(f"scheduler_state_{domain}", None), ) ) @@ -287,7 +255,7 @@ def __init__(self, pipeline_config: RLVRConfig): if self.val_dataset: val_pipeline_config = copy.deepcopy(self.pipeline_config) val_pipeline_config.is_use_additional_prompts = False - self.val_generate_scheduler = scheduler_cls.options( + self.val_generate_scheduler = ray.remote(DynamicSamplingScheduler).options( scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False, @@ -301,9 +269,6 @@ def __init__(self, pipeline_config: RLVRConfig): dataset=self.val_dataset, collect_fn_cls=DataCollatorWithPaddingForPaddedKeys, collect_fn_kwargs=dict(max_length=self.pipeline_config.prompt_length, padding="max_length"), - response_filter_fn=lambda data_item, config: True, - query_filter_fn=lambda data_list, config: True, - response_callback_fn=self.val_generate_scheduler.report_response.remote, is_val=True, ) ) @@ -312,7 +277,7 @@ def __init__(self, pipeline_config: RLVRConfig): refs.extend(self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=False)) ray.get(refs) - if not self.is_lora and self.pipeline_config.enable_reference: + if self.use_ref_model: refs.extend(self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True)) refs = [] @@ -433,24 +398,22 @@ def run(self): actor_infer_timer = _Timer(window_size=5) actor_infer_response_timer = _Timer(window_size=5) actor_train_timer = _Timer(window_size=5) - + metrics_mgr.timers["tps"] = tps_timer metrics_mgr.timers["actor_infer"] = actor_infer_timer metrics_mgr.timers["actor_infer_response"] = actor_infer_response_timer metrics_mgr.timers["actor_train"] = actor_train_timer pre_step_total_time = 0 - if self.pipeline_config.async_pipeline and self.pipeline_config.generate_opt_level == 1: + if self.pipeline_config.async_pipeline: for reward_cluster in self.rewards.values(): reward_cluster.load_states() - first_step = True for global_step in range(self.pipeline_config.max_steps): if global_step <= self.state.step: global_step += 1 continue logger.info(f"pipeline step {global_step} start...") - should_eval = self.val_dataset and global_step % self.pipeline_config.eval_steps == 0 metrics_mgr.clear_metrics() with tps_timer, Timer(name="step_total", logger=None) as step_total_timer: @@ -458,7 +421,10 @@ def run(self): logger.info(f"pre_step_total_time: {pre_step_total_time}") metrics_mgr.add_metric("time/step_total", pre_step_total_time) batch: DataProto = DataProto( - meta_info={"global_step": global_step, "collect_unfinished": self.pipeline_config.async_pipeline} + meta_info={ + "global_step": global_step, + "collect_unfinished": self.pipeline_config.async_pipeline, + } ) # 先model update,resume时不需要保存infer cluster的状态 @@ -467,12 +433,9 @@ def run(self): self.actor_train.offload_states(blocking=True) with Timer(name="step_stop_server", logger=None) as step_stop_server_timer: - if self.pipeline_config.async_pipeline and not 
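Both schedulers are now created by wrapping the plain DynamicSamplingScheduler class with ray.remote(...) at construction time and pinning the actor to the driver's node. Stripped of the pipeline-specific arguments, the pattern looks like this (toy actor class for illustration):

import ray
from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy

class Scheduler:  # stand-in for DynamicSamplingScheduler
    def ping(self):
        return "ok"

ray.init(ignore_reinit_error=True)

actor = (
    ray.remote(Scheduler)                      # turn the class into an actor class
    .options(
        scheduling_strategy=NodeAffinitySchedulingStrategy(
            node_id=ray.get_runtime_context().get_node_id(),
            soft=False,                        # fail instead of migrating elsewhere
        )
    )
    .remote()                                  # construct the actor
)
print(ray.get(actor.ping.remote()))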
first_step and self.pipeline_config.generate_opt_level == 1: - scheduler_refs = [] - for scheduler in self.generate_schedulers.values(): - scheduler_refs.append(scheduler.pause_sampling.remote(data=batch)) - ray.get(scheduler_refs, timeout=self.pipeline_config.rpc_timeout) - self.actor_infer.stop_server() + if self.pipeline_config.async_pipeline: + ray.get([scheduler.pause_sampling.remote() for scheduler in self.generate_schedulers.values()]) + self.actor_infer.offload_states(include=OffloadStateType.other_params) metrics_mgr.add_metric("time/step_stop_server", step_stop_server_timer.last) with Timer(name="step_model_update", logger=None) as step_model_update_timer: @@ -481,9 +444,14 @@ def run(self): batch.meta_info["generation_config"] = self.get_generation_config() metrics_mgr.add_metric("time/step_model_update", step_model_update_timer.last) - if should_eval and not self.pipeline_config.async_pipeline: + self.actor_infer.load_states(blocking=True) + if not self.pipeline_config.async_pipeline: + for reward_cluster in self.rewards.values(): + reward_cluster.load_states() + + if self.val_dataset and global_step % self.pipeline_config.eval_steps == 0: with Timer(name="val_step", logger=None) as val_step_timer: - val_metrics = self.val() + val_metrics = self.val(global_step=global_step) metrics_mgr.add_metrics(val_metrics) metrics_mgr.add_metric("time/val_step", val_step_timer.last) @@ -494,33 +462,10 @@ def run(self): Timer(name="step_generate", logger=None) as step_generate_timer, ): domain_batches = {} - if self.pipeline_config.generate_opt_level == 1: - self.actor_infer.start_server(data=DataProto(meta_info=batch.meta_info)) - batch.meta_info["is_offload_states"] = False - if self.pipeline_config.async_pipeline: - if should_eval: - # 为Validation创建独立的DataProto - val_batch = DataProto() - val_batch.meta_info = { - "global_step": global_step, - "generation_config": self.pipeline_config.validation.generating_args.to_dict() - } - self.val_generate_scheduler.start_sampling.remote(data=val_batch, batch_size=len(self.val_dataset)) - - scheduler_refs = [] - for domain, scheduler in self.generate_schedulers.items(): - scheduler_refs.append( - scheduler.start_sampling.remote(data=batch, batch_size=self.domain_batch_size[domain]) - ) - ray.get(scheduler_refs, timeout=self.pipeline_config.rpc_timeout) - else: - for reward_cluster in self.rewards.values(): - reward_cluster.load_states() - scheduler_refs = {} for domain, scheduler in self.generate_schedulers.items(): scheduler_refs[domain] = scheduler.get_batch.remote( - data=batch, batch_size=self.domain_batch_size[domain] + data=batch, global_step=global_step, batch_size=self.domain_batch_size[domain] ) for domain, scheduler_ref in scheduler_refs.items(): domain_batch: DataProto = ray.get(scheduler_ref, timeout=self.pipeline_config.rpc_timeout) @@ -532,37 +477,42 @@ def run(self): dump_rollout_to_specific_path(self.pipeline_config.rollout_dump_dir, global_step, generate_output, self.tokenizer) generate_output.meta_info.pop("is_offload_states", None) - if not self.pipeline_config.async_pipeline and self.pipeline_config.generate_opt_level == 1: + if not self.pipeline_config.async_pipeline: + ray.get([scheduler.pause_sampling.remote() for scheduler in self.generate_schedulers.values()]) + self.actor_infer.offload_states() for reward_cluster in self.rewards.values(): reward_cluster.offload_states() - gen_metrics = self.actor_infer.stop_server() - metrics_mgr.add_domain_metrics( - domain, reduce_metrics(gen_metrics.meta_info.pop("metrics", {})) - ) 
metrics_mgr.add_metric("time/step_generate", step_generate_timer.last) batch = generate_output batch.meta_info["global_step"] = global_step batch.meta_info["_broadcast_non_tensor_batch"] = True + batch.meta_info["loss_mask_keys"] = ['response_mask', 'final_response_mask'] batch.non_tensor_batch['sample_uuid'] = np.array([str(uuid.uuid4()) for _ in range(batch.batch.shape[0])], dtype=object) - + batch.batch["prompt_id"] = torch.arange(batch.batch.batch_size[0], device=batch.batch.device) with Timer(name="cal_ref_log_probs", logger=None) as cal_ref_log_probs_timer: if self.pipeline_config.enable_reference: - if self.is_lora: + worker_config = self.pipeline_config.reference if self.use_ref_model else self.pipeline_config.actor_train + worker = self.reference if self.use_ref_model else self.pipeline_config.actor_train + if worker_config.use_dynamic_batching_in_infer: + batch, dynamic_batching_metrics = dynamic_batching_shard( + batch, + worker.dp_size, + worker_config.max_tokens_per_microbatch_in_infer, + worker_config.sequence_length_round_in_infer, + worker_config.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + worker_config.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), + "reference/compute_log_probs", + ) + metrics_mgr.add_metrics(dynamic_batching_metrics) + if not self.use_ref_model: batch.meta_info["disable_adapter"] = True batch.meta_info["is_offload_states"] = False + batch_balance(batch, dp_size=self.actor_train.dp_size, minibatch_size=len(batch)) ref_log_probs = self.actor_train.compute_log_probs(batch, blocking=True) else: - if self.pipeline_config.reference.use_dynamic_batching_in_infer: - batch, dynamic_batching_metrics = dynamic_batching_shard( - batch, - self.reference.dp_size, - self.pipeline_config.reference.max_tokens_per_microbatch_in_infer, - self.pipeline_config.reference.sequence_length_round_in_infer, - "reference/compute_log_probs", - ) - metrics_mgr.add_metrics(dynamic_batching_metrics) + batch_balance(batch, dp_size=self.reference.dp_size, minibatch_size=len(batch)) ref_log_probs = self.reference.compute_log_probs(batch, blocking=True) metrics_mgr.add_reduced_metrics(ref_log_probs.meta_info.pop("metrics", {})) ref_log_probs.rename(old_keys="log_probs", new_keys="ref_log_probs") @@ -570,19 +520,22 @@ def run(self): metrics_mgr.add_metric("time/ref_log_probs_values", cal_ref_log_probs_timer.last) with Timer(name="cal_old_log_probs_values", logger=None) as cal_old_logpb_timer: - if self.is_lora: + if self.pipeline_config.enable_reference and not self.use_ref_model: batch.meta_info["disable_adapter"] = False batch.meta_info["is_offload_states"] = False if self.pipeline_config.adv_estimator == "gae": values_refs: List[ray.ObjectRef] = self.critic.compute_values(batch, blocking=False) if self.pipeline_config.enable_old_logprobs_recompute: + batch_balance(batch, dp_size=self.actor_train.dp_size, minibatch_size=len(batch)) if self.pipeline_config.actor_train.use_dynamic_batching_in_infer: batch, dynamic_batching_metrics = dynamic_batching_shard( batch, self.actor_train.dp_size, self.pipeline_config.actor_train.max_tokens_per_microbatch_in_infer, self.pipeline_config.actor_train.sequence_length_round_in_infer, + self.pipeline_config.actor_train.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + self.pipeline_config.actor_train.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), "actor_train/compute_log_probs", ) metrics_mgr.add_metrics(dynamic_batching_metrics) @@ -622,7 
+575,7 @@ def run(self): metrics_mgr.add_metric("time/old_log_probs", cal_old_logpb_timer.last) # 要按domain group by处理reward - batch.batch["prompt_id"] = torch.arange(batch.batch.batch_size[0], device=batch.batch.device) + batch.reorder(indices=torch.argsort(batch.batch["prompt_id"])) batch_grouped: Dict[str, DataProto] = batch.group_by("domain") batch_list = [] for domain, domain_batch in batch_grouped.items(): @@ -698,6 +651,12 @@ def run(self): batch_grouped: Dict[str, DataProto] = batch.group_by("domain") metrics_mgr.add_domain_all_metrics(global_step, batch_grouped) + if self.pipeline_config.enable_old_logprobs_recompute: + batch, corr_metrics = apply_train_infer_correction_to_batch(self.pipeline_config, batch, + update_mask_keys=batch.meta_info[ + 'loss_mask_keys']) + metrics_mgr.add_metrics(corr_metrics) + with Timer(name="step_train", logger=None) as step_train_timer: if self.pipeline_config.adv_estimator == "gae": critic_train_metrics_refs: List[ray.ObjectRef] = self.critic.train_step(batch, blocking=False) @@ -705,6 +664,12 @@ def run(self): with actor_train_timer: # implement critic warmup if self.pipeline_config.critic_warmup <= global_step: + # Reorder data for DP rank load balancing + batch_balance_metrics = batch_balance(batch, dp_size=self.actor_train.dp_size, + minibatch_size=self.pipeline_config.actor_train.training_args.per_device_train_batch_size + * self.pipeline_config.actor_train.training_args.gradient_accumulation_steps + * self.actor_train.dp_size, logging_prefix="global_seqlen/actor_train") + metrics_mgr.add_metrics(batch_balance_metrics) # update actor if self.pipeline_config.actor_train.use_dynamic_batching_in_train: batch, dynamic_batching_metrics = dynamic_batching_shard( @@ -712,6 +677,8 @@ def run(self): self.actor_train.dp_size, self.pipeline_config.actor_train.max_tokens_per_microbatch_in_train, self.pipeline_config.actor_train.sequence_length_round_in_train, + self.pipeline_config.actor_train.strategy_args.strategy_config.get("pipeline_model_parallel_size", 1), + self.pipeline_config.actor_train.strategy_args.strategy_config.get("virtual_pipeline_model_parallel_size", None), "actor_train/train_step", ) metrics_mgr.add_metrics(dynamic_batching_metrics) @@ -737,16 +704,6 @@ def run(self): for domain, scheduler in self.generate_schedulers.items(): self.state.kv[f"scheduler_state_{domain}"] = ray.get(scheduler.get_scheduler_state.remote()) - if ( - self.pipeline_config.async_pipeline - and self.val_dataset - and global_step % self.pipeline_config.eval_steps == 0 - ): - with Timer(name="val_step", logger=None) as val_step_timer: - val_metrics = self.val() - metrics_mgr.add_metrics(val_metrics) - metrics_mgr.add_metric("time/val_step", val_step_timer.last) - metrics = metrics_mgr.get_metrics() # do ckpt self.state.step = global_step @@ -774,32 +731,30 @@ def run(self): logger.info(f"pipeline step {global_step} finished") global_step += 1 - first_step = False pre_step_total_time = step_total_timer.last - + + ray.get([scheduler.shutdown.remote() for scheduler in self.generate_schedulers.values()]) + ray.get(self.val_generate_scheduler.shutdown.remote()) logger.info("pipeline complete!") @torch.no_grad() - def val(self): + def val(self, global_step): val_metrics_mgr = MetricsManager() batch = DataProto() with Timer(name="step_generate", logger=None) as step_generate_timer: - batch.meta_info["is_offload_states"] = False - batch.meta_info["generation_config"] = self.pipeline_config.validation.generating_args.to_dict() - if not self.pipeline_config.async_pipeline: - 
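dynamic_batching_shard is now also given the pipeline-parallel and virtual-pipeline sizes so the number of micro-batches it produces stays compatible with the pipeline schedule. Its core job, packing variable-length sequences under a per-micro-batch token budget, can be illustrated with a simplified helper (not the implementation in roll.utils.dynamic_batching):

from typing import List

def pack_by_token_budget(seq_lens: List[int], max_tokens: int) -> List[List[int]]:
    """Greedily group sample indices so each group stays within max_tokens."""
    groups, current, current_tokens = [], [], 0
    for idx, n in enumerate(seq_lens):
        if current and current_tokens + n > max_tokens:
            groups.append(current)
            current, current_tokens = [], 0
        current.append(idx)
        current_tokens += n
    if current:
        groups.append(current)
    return groups

# pack_by_token_budget([900, 300, 600, 200], max_tokens=1024) -> [[0], [1, 2], [3]]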
self.actor_infer.start_server(data=DataProto(meta_info=batch.meta_info)) - for reward_cluster in self.rewards.values(): - reward_cluster.load_states() + batch.meta_info = { + "is_offload_states": False, + "generation_config": self.pipeline_config.validation.generating_args.to_dict(), + "global_step": global_step, + } + generate_output: DataProto = ray.get( - self.val_generate_scheduler.get_batch.remote(data=batch, batch_size=len(self.val_dataset)), + self.val_generate_scheduler.get_batch.remote(data=batch, global_step=global_step, batch_size=len(self.val_dataset)), timeout=self.pipeline_config.rpc_timeout, ) - if not self.pipeline_config.async_pipeline and self.pipeline_config.generate_opt_level == 1: - self.actor_infer.stop_server() - for reward_cluster in self.rewards.values(): - reward_cluster.offload_states() + generate_output.meta_info.pop("is_offload_states", None) val_metrics_mgr.add_metric("time/step_generate", step_generate_timer.last) diff --git a/roll/pipeline/rlvr/rlvr_rollout_pipeline.py b/roll/pipeline/rlvr/rlvr_rollout_pipeline.py index 64a28c491..f81492111 100644 --- a/roll/pipeline/rlvr/rlvr_rollout_pipeline.py +++ b/roll/pipeline/rlvr/rlvr_rollout_pipeline.py @@ -40,7 +40,6 @@ def __init__(self, pipeline_config: RLVRConfig): "rollout pipeline should strategy sleep_level 1, set sleep_level: 1." ) - scheduler_cls = DynamicSamplingScheduler self.tokenizer = default_tokenizer_provider(model_args=self.pipeline_config.actor_infer.model_args) self.val_dataset = None @@ -92,7 +91,7 @@ def __init__(self, pipeline_config: RLVRConfig): val_pipeline_config = copy.deepcopy(self.pipeline_config) val_pipeline_config.is_use_additional_prompts = False - self.val_generate_scheduler = scheduler_cls.options( + self.val_generate_scheduler = ray.remote(DynamicSamplingScheduler).options( scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False, @@ -105,9 +104,6 @@ def __init__(self, pipeline_config: RLVRConfig): dataset=self.val_dataset, collect_fn_cls=DataCollatorWithPaddingForPaddedKeys, collect_fn_kwargs=dict(max_length=self.pipeline_config.prompt_length, padding="max_length"), - response_filter_fn=lambda data_item, config: True, - query_filter_fn=lambda data_list, config: True, - response_callback_fn=self.val_generate_scheduler.report_response.remote, is_val=True, ) ) @@ -131,14 +127,13 @@ def run(self): with Timer(name="step_generate", logger=None) as step_generate_timer: batch.meta_info["is_offload_states"] = False batch.meta_info["generation_config"] = self.pipeline_config.validation.generating_args.to_dict() - self.actor_infer.start_server(data=DataProto(meta_info=batch.meta_info)) + self.actor_infer.load_states() for reward_cluster in self.rewards.values(): reward_cluster.load_states() generate_output: DataProto = ray.get( - self.val_generate_scheduler.get_batch.remote(data=batch, batch_size=len(self.val_dataset)), + self.val_generate_scheduler.get_batch.remote(data=batch, global_step=global_step, batch_size=len(self.val_dataset)), timeout=self.pipeline_config.rpc_timeout, ) - self.actor_infer.stop_server() for reward_cluster in self.rewards.values(): reward_cluster.offload_states() generate_output.meta_info.pop("is_offload_states", None) @@ -169,4 +164,6 @@ def run(self): logger.info(f"pipeline step {global_step} finished") + ray.get(self.val_generate_scheduler.shutdown.remote()) + logger.info("pipeline complete!") diff --git a/roll/pipeline/rlvr/rlvr_vlm_pipeline.py b/roll/pipeline/rlvr/rlvr_vlm_pipeline.py index 
e0e0407ab..19a462801 100644 --- a/roll/pipeline/rlvr/rlvr_vlm_pipeline.py +++ b/roll/pipeline/rlvr/rlvr_vlm_pipeline.py @@ -12,22 +12,24 @@ import ray import torch from codetiming import Timer -from datasets import load_dataset, load_from_disk +from datasets import load_from_disk from ray.util.scheduling_strategies import NodeAffinitySchedulingStrategy from ray.util.timer import _Timer from transformers import AutoConfig, ProcessorMixin from transformers.image_utils import load_images from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize +from roll.configs import GeneratingArguments from roll.datasets.collator import DataCollatorWithPaddingForMM from roll.datasets.dataset import get_dataset from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.generate_scheduler import DynamicSamplingScheduler from roll.distributed.scheduler.protocol import DataProto -from roll.models.model_providers import default_processor_provider +from roll.models.model_providers import default_processor_provider, get_extra_data_provider from roll.pipeline.base_pipeline import BasePipeline from roll.pipeline.rlvr.rlvr_config import RLVRConfig -from roll.pipeline.rlvr.rlvr_pipeline import query_filter_fn, update_dataset_domain +from roll.pipeline.rlvr.rlvr_pipeline import update_dataset_domain +from roll.pipeline.rlvr.utils import dump_rollout_to_specific_path from roll.utils.checkpoint_manager import download_model from roll.utils.functionals import ( RunningMoments, @@ -42,6 +44,7 @@ from roll.utils.logging import get_logger from roll.utils.metrics.metrics_manager import MetricsManager from roll.utils.packages import is_transformers_version_greater_than +from roll.utils.offload_states import OffloadStateType logger = get_logger() @@ -118,10 +121,10 @@ def encode_function( image_flag = [True] * len(prompt_getter(data)) image_list = [] for idx, image in enumerate(image_getter(data)): - if image is None: + if not image: image_flag[idx] = False try: - if isinstance(image, bytes): # bytes data + if isinstance(image, bytes): # bytes data # TODO: support multiple images image_out = Image.open(BytesIO(image)) else: @@ -153,6 +156,8 @@ def encode_function( "prompt": text_list, "ground_truth": ground_truth_getter(data), "reward_model": data["reward_model"], + # for text and multi-modal mixed data usage, indicating valid image + "image_flag": image_flag, } return encodings @@ -174,6 +179,8 @@ def get_vlm_dataset(data_args, encode_function, processor, get_eval=False): "prompt": datasets.Value(dtype="string"), "ground_truth": datasets.Value(dtype="string"), "reward_model": dataset.features["reward_model"], + # for text and multi-modal mixed data usage, indicating valid image + "image_flag": datasets.Value("bool"), } ) remove_columns = list(dataset.features.keys() - features.keys()) @@ -200,63 +207,6 @@ def get_vlm_dataset(data_args, encode_function, processor, get_eval=False): return dataset -def get_extra_data_provider(model_name_or_path: str, processor=None): - model_name_or_path = download_model(model_name_or_path) - config = AutoConfig.from_pretrained(model_name_or_path, trust_remote_code=True) - if "qwen2" in config.model_type: - import types - - from transformers import BatchFeature # help define a object to accesss attr - - dummy_self = BatchFeature( - { - "config": BatchFeature( - { - "vision_config": BatchFeature({"spatial_merge_size": processor.image_processor.merge_size}), - "image_token_id": processor.tokenizer.convert_tokens_to_ids("<|image_pad|>"), - 
"video_token_id": processor.tokenizer.convert_tokens_to_ids("<|video_pad|>"), - "vision_start_token_id": processor.tokenizer.convert_tokens_to_ids("<|vision_start|>"), - } - ) - } - ) - if is_transformers_version_greater_than("4.52.0"): - from transformers.models.qwen2_vl import Qwen2VLModel - - get_rope_index = types.MethodType(Qwen2VLModel.get_rope_index, dummy_self) - else: - from transformers.models.qwen2_vl import Qwen2VLForConditionalGeneration - - get_rope_index = types.MethodType(Qwen2VLForConditionalGeneration.get_rope_index, dummy_self) - - def extra_data_provider( - input_ids: torch.LongTensor, - image_grid_thw: Optional[torch.LongTensor] = None, - video_grid_thw: Optional[torch.LongTensor] = None, - attention_mask: Optional[torch.Tensor] = None, - ): - rope_index = get_rope_index(input_ids, image_grid_thw, video_grid_thw, attention_mask)[0] - # (3, bsz, seqlen) -> (bsz, 3, seqlen) to put it into DataProto, - # transpose it batck to (3, bsz, seqlen) before forward for model - rope_index = rope_index.transpose(0, 1) - return {"position_ids": rope_index} - - return extra_data_provider - - def default_extra_data_provider( - input_ids: torch.LongTensor, - attention_mask: Optional[torch.Tensor] = None, - ): - bsz, seqlen = input_ids.shape - position_ids = torch.arange(seqlen, dtype=torch.long, device=input_ids.device) - position_ids = position_ids.unsqueeze(0).expand(bsz, -1) - if attention_mask is not None: - position_ids = position_ids.masked_fill(attention_mask == 0, 0) - return {"position_ids": position_ids} - - return default_extra_data_provider - - class RLVRVLMPipeline(BasePipeline): def __init__(self, pipeline_config: RLVRConfig): super().__init__(pipeline_config) @@ -366,7 +316,7 @@ def __init__(self, pipeline_config: RLVRConfig): else: domain_batch_size = int(domain_ratios[domain] * self.pipeline_config.rollout_batch_size) accumulated += domain_batch_size - generate_scheduler = DynamicSamplingScheduler.options( + generate_scheduler = ray.remote(DynamicSamplingScheduler).options( scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False ) @@ -387,15 +337,11 @@ def __init__(self, pipeline_config: RLVRConfig): prompt_key="prompt", answer_key="ground_truth", image_key="images", - image_flag_key=None, + image_flag_key="image_flag", max_length=self.pipeline_config.prompt_length, padding="max_length", ), - response_filter_fn=lambda data_item, config: True, - query_filter_fn=query_filter_fn, - response_callback_fn=generate_scheduler.report_response.remote, state=self.state.kv.get(f"scheduler_state_{domain}", None), - is_vlm=True, ) ) self.generate_schedulers[domain] = generate_scheduler @@ -409,7 +355,7 @@ def __init__(self, pipeline_config: RLVRConfig): if self.val_dataset: val_pipeline_config = copy.deepcopy(self.pipeline_config) val_pipeline_config.is_use_additional_prompts = False - self.val_generate_scheduler = DynamicSamplingScheduler.options( + self.val_generate_scheduler = ray.remote(DynamicSamplingScheduler).options( scheduling_strategy=NodeAffinitySchedulingStrategy( node_id=ray.get_runtime_context().get_node_id(), soft=False ) @@ -432,14 +378,11 @@ def __init__(self, pipeline_config: RLVRConfig): prompt_key="prompt", answer_key="ground_truth", image_key="images", - image_flag_key=None, + image_flag_key="image_flag", max_length=self.pipeline_config.prompt_length, padding="max_length", ), - response_filter_fn=lambda data_item, config: True, - query_filter_fn=lambda data_list, config: True, - 
response_callback_fn=self.val_generate_scheduler.report_response.remote, - is_vlm=True, + is_val=True, ) ) @@ -475,6 +418,15 @@ def __init__(self, pipeline_config: RLVRConfig): for domain in self.rewards.keys(): self.running[domain] = RunningMoments() + def get_generation_config(self, generating_args: Optional[GeneratingArguments] = None): + generating_args = ( + generating_args if generating_args is not None else self.actor_infer.worker_config.generating_args + ) + generation_config = generating_args.to_dict() + if self.pipeline_config.async_pipeline: + generation_config["logprobs"] = 1 + return generation_config + @torch.no_grad() def run(self): metrics_mgr = MetricsManager() @@ -489,6 +441,11 @@ def run(self): metrics_mgr.timers["actor_infer_response"] = actor_infer_response_timer metrics_mgr.timers["actor_train"] = actor_train_timer + pre_step_total_time = 0 + if self.pipeline_config.async_pipeline: + for reward_cluster in self.rewards.values(): + reward_cluster.load_states() + for global_step in range(self.pipeline_config.max_steps): if global_step <= self.state.step: global_step += 1 @@ -498,48 +455,54 @@ def run(self): metrics_mgr.clear_metrics() with tps_timer, Timer(name="step_total", logger=None) as step_total_timer: + logger.info(f"pre_step_total_time: {pre_step_total_time}") + metrics_mgr.add_metric("time/step_total", pre_step_total_time) + batch: DataProto = DataProto( + meta_info={ + "global_step": global_step, + "collect_unfinished": self.pipeline_config.async_pipeline, + "max_steps": self.pipeline_config.max_steps, + "is_training": True, + } + ) if self.pipeline_config.adv_estimator == "gae": self.critic.offload_states(blocking=True) self.actor_train.offload_states(blocking=True) + with Timer(name="step_stop_server", logger=None) as step_stop_server_timer: + if self.pipeline_config.async_pipeline: + ray.get([scheduler.pause_sampling.remote() for scheduler in self.generate_schedulers.values()]) + self.actor_infer.offload_states(include=OffloadStateType.other_params) + metrics_mgr.add_metric("time/step_stop_server", step_stop_server_timer.last) + with Timer(name="step_model_update", logger=None) as step_model_update_timer: model_update_metrics: Dict = self.model_update(global_step) metrics_mgr.add_metrics(model_update_metrics) - metrics_mgr.add_metric("time/step_model_update", step_model_update_timer.last) + batch.meta_info["generation_config"] = self.get_generation_config() + metrics_mgr.add_metric("time/step_model_update", step_model_update_timer.last) + + self.actor_infer.load_states(blocking=True) + + if not self.pipeline_config.async_pipeline: + for reward_cluster in self.rewards.values(): + reward_cluster.load_states() if self.val_dataset and global_step % self.pipeline_config.eval_steps == 0: with Timer(name="val_step", logger=None) as val_step_timer: - val_metrics = self.val() - metrics_mgr.add_metrics(val_metrics) - metrics_mgr.add_metric("time/val_step", val_step_timer.last) - - batch: DataProto = DataProto() - batch.meta_info = {"global_step": global_step} + val_metrics = self.val(global_step=global_step) + metrics_mgr.add_metrics(val_metrics) + metrics_mgr.add_metric("time/val_step", val_step_timer.last) # 要按domain group by生成对应的batch with actor_infer_timer, actor_infer_response_timer, Timer( name="step_generate", logger=None ) as step_generate_timer: domain_batches = {} - batch.meta_info["generation_config"] = self.actor_infer.worker_config.generating_args.to_dict() - self.actor_infer.start_server(data=DataProto(meta_info=batch.meta_info)) - for reward_cluster in 
self.rewards.values(): - reward_cluster.load_states() - - batch.meta_info["is_offload_states"] = False - # meta mainly for dynamic reward threshold, such as global_step/max_steps - batch.meta_info.update( - { - "global_step": self.global_step, - "max_steps": self.pipeline_config.max_steps, - "is_training": True, - } - ) scheduler_refs = {} for domain, scheduler in self.generate_schedulers.items(): scheduler_refs[domain] = scheduler.get_batch.remote( - data=batch, batch_size=self.domain_batch_size[domain] + data=batch, global_step=global_step, batch_size=self.domain_batch_size[domain] ) for domain, scheduler_ref in scheduler_refs.items(): domain_batch: DataProto = ray.get(scheduler_ref, timeout=self.pipeline_config.rpc_timeout) @@ -548,17 +511,22 @@ def run(self): ) domain_batches[domain] = domain_batch generate_output = DataProto.concat([domain_batch for domain_batch in domain_batches.values()]) + dump_rollout_to_specific_path( + self.pipeline_config.rollout_dump_dir, global_step, generate_output, self.tokenizer + ) generate_output.meta_info.pop("is_offload_states", None) - for reward_cluster in self.rewards.values(): - reward_cluster.offload_states() - gen_metrics = self.actor_infer.stop_server() - metrics_mgr.add_metrics(reduce_metrics(gen_metrics.meta_info.pop("metrics", {}))) + if not self.pipeline_config.async_pipeline: + ray.get([scheduler.pause_sampling.remote() for scheduler in self.generate_schedulers.values()]) + for reward_cluster in self.rewards.values(): + reward_cluster.offload_states() + self.actor_infer.offload_states() metrics_mgr.add_metric("time/step_generate", step_generate_timer.last) batch = generate_output # mark here to make megatron get_data_input broadcast with non_batch_tensor - batch.meta_info["_broadcast_non_tensor_batch"]= True + batch.meta_info["_broadcast_non_tensor_batch"] = True + batch.meta_info["loss_mask_keys"] = ["response_mask", "final_response_mask"] batch.non_tensor_batch['sample_uuid'] = np.array([str(uuid.uuid4()) for _ in range(batch.batch.shape[0])], dtype=object) with Timer(name="cal_ref_log_probs", logger=None) as cal_ref_log_probs_timer: @@ -646,6 +614,22 @@ def run(self): metrics_mgr.add_domain_metrics(domain, {"time/compute_advantage": compute_advantage_timer.last}) batch = DataProto.concat(batch_list) + + if batch.batch["final_response_mask"].sum() == 0: + logger.info("Warning: final_response_mask.sum() == 0! 
Current step will be skipped.") + metrics_mgr.add_metric("mask/final_mask_sum_eq_0", 1) + metrics = metrics_mgr.get_metrics() + # do ckpt + self.state.step = global_step + self.state.log_history.append(metrics) + for domain, scheduler in self.generate_schedulers.items(): + self.state.kv[f"scheduler_state_{domain}"] = ray.get(scheduler.get_scheduler_state.remote()) + self.do_checkpoint(global_step=global_step) + self.tracker.log(values=metrics, step=global_step) + continue + else: + metrics_mgr.add_metric("mask/final_mask_sum_eq_0", 0) + batch.reorder(indices=torch.argsort(batch.batch["prompt_id"])) batch.pop("prompt_id") @@ -677,7 +661,7 @@ def run(self): critic_train_metrics = DataProto.materialize_concat(data_refs=critic_train_metrics_refs) metrics_mgr.add_reduced_metrics(critic_train_metrics.meta_info.pop("metrics", {})) - metrics_mgr.add_metric("time/step_train", step_train_timer.last) + metrics_mgr.add_metric("time/step_train", step_train_timer.last) tps_timer.push_units_processed(n=torch.sum(batch.batch["attention_mask"]).detach().item()) actor_infer_timer.push_units_processed(n=torch.sum(batch.batch["attention_mask"]).detach().item()) @@ -715,10 +699,15 @@ def run(self): logger.info(f"pipeline step {global_step} finished") global_step += 1 + pre_step_total_time = step_total_timer.last + + ray.get([scheduler.shutdown.remote() for scheduler in self.generate_schedulers.values()]) + ray.get(self.val_generate_scheduler.shutdown.remote()) + logger.info("pipeline complete!") @torch.no_grad() - def val(self): + def val(self, global_step): val_metrics_mgr = MetricsManager() batch = DataProto() @@ -728,18 +717,11 @@ def val(self): batch.meta_info.update( {"global_step": self.global_step, "max_steps": self.pipeline_config.max_steps, "is_training": False} ) - - self.actor_infer.start_server(data=DataProto(meta_info=batch.meta_info)) - for reward_cluster in self.rewards.values(): - reward_cluster.load_states() generate_output: DataProto = ray.get( - self.val_generate_scheduler.get_batch.remote(data=batch, batch_size=len(self.val_dataset)), + self.val_generate_scheduler.get_batch.remote(data=batch, global_step=global_step, batch_size=len(self.val_dataset)), timeout=self.pipeline_config.rpc_timeout, ) - self.actor_infer.stop_server() generate_output.meta_info.pop("is_offload_states", None) - for reward_cluster in self.rewards.values(): - reward_cluster.offload_states() val_metrics_mgr.add_metric("time/step_generate", step_generate_timer.last) batch = generate_output @@ -752,7 +734,7 @@ def val(self): grouped_batch = epoch_batch.group_by("tag") for group_key, group_batch in grouped_batch.items(): score_mean = group_batch.batch["scores"].mean().item() - print(f"{group_key}: {score_mean}") + logger.info(f"val_score/{group_key}: {score_mean}") val_metrics_mgr.add_domain_metrics( "val_score", {f"{group_key}/mean": group_batch.batch["scores"].detach().float().mean().item()} ) diff --git a/roll/pipeline/rlvr/utils.py b/roll/pipeline/rlvr/utils.py index 7173a1118..11e60b567 100644 --- a/roll/pipeline/rlvr/utils.py +++ b/roll/pipeline/rlvr/utils.py @@ -3,18 +3,20 @@ import time import numpy import copy +import requests + +import torch + from codetiming import Timer import multiprocessing - from roll.distributed.scheduler.protocol import DataProto - from roll.utils.logging import get_logger logger = get_logger() -COLUMNS_CONFIG = [ +COLUMMNS_CONFIG = [ ['global_step','bigint'], ['id','string'], ['source','string'], diff --git a/roll/pipeline/sft/sft_config.py b/roll/pipeline/sft/sft_config.py index 
bf60f429e..d23fab07e 100644 --- a/roll/pipeline/sft/sft_config.py +++ b/roll/pipeline/sft/sft_config.py @@ -59,7 +59,5 @@ def __post_init__(self): self.sft_train.name = "sft_train" - self.validate_worker_config() - def set_max_steps(self, max_steps: int): self.sft_train.training_args.max_steps = max_steps diff --git a/roll/pipeline/sft/sft_pipeline.py b/roll/pipeline/sft/sft_pipeline.py index 8bdc0b365..1d30bf5f6 100644 --- a/roll/pipeline/sft/sft_pipeline.py +++ b/roll/pipeline/sft/sft_pipeline.py @@ -4,9 +4,9 @@ import numpy as np import ray import torch +from tqdm import tqdm from codetiming import Timer from torch.utils.data import DataLoader -from tqdm import tqdm from roll.datasets.chat_template import get_chat_template from roll.datasets.collator import DataCollatorForSFT @@ -18,7 +18,7 @@ from roll.utils.constants import IGNORE_INDEX from roll.utils.logging import get_logger from roll.utils.metrics.metrics_manager import MetricsManager - +from roll.utils.functionals import batch_balance, reduce_metrics logger = get_logger() @@ -38,7 +38,7 @@ def preprocess_dataset(dataset, prompt_len, encode_func, num_proc): def get_encode_function(template_name, tokenizer, prompt_key, query_key, response_key, system_key=None): chat_template_func = get_chat_template(template_name, tokenizer) - + def build_conversation(system_prompt, prompt, query, response): conversation = [] if system_prompt: @@ -98,26 +98,26 @@ def __init__(self, pipeline_config: SFTConfig): dataset_paths.append(train_file_name) logger.info(f"load_dataset_paths: {chr(10)} {chr(10).join(dataset_paths)}") self.dataset = datasets.load_dataset("json", data_files=dataset_paths)["train"] - + self.val_dataset = None if self.pipeline_config.validation and self.pipeline_config.validation.data_args: val_dataset_paths = self.pipeline_config.validation.data_args.file_name self.val_dataset = datasets.load_dataset("json", data_files=val_dataset_paths)["train"] - + template_name = ( self.pipeline_config.global_template if self.pipeline_config.global_template else self.pipeline_config.sft_train.data_args.template ) - encode_function = get_encode_function(template_name, self.tokenizer, - self.pipeline_config.prompt_key, - self.pipeline_config.query_key, - self.pipeline_config.response_key, + encode_function = get_encode_function(template_name, self.tokenizer, + self.pipeline_config.prompt_key, + self.pipeline_config.query_key, + self.pipeline_config.response_key, self.pipeline_config.system_key) self.dataset = preprocess_dataset( - self.dataset, - self.pipeline_config.sequence_length, - encode_function, + self.dataset, + self.pipeline_config.sequence_length, + encode_function, num_proc=self.pipeline_config.sft_train.data_args.preprocessing_num_workers) data_collator = DataCollatorForSFT( @@ -144,16 +144,16 @@ def __init__(self, pipeline_config: SFTConfig): dp_size = self.sft_train.dp_size ga_steps = self.pipeline_config.sft_train.training_args.gradient_accumulation_steps per_device_bs = self.pipeline_config.sft_train.training_args.per_device_train_batch_size - global_train_batch_size = dp_size * ga_steps * per_device_bs + self.global_train_batch_size = dp_size * ga_steps * per_device_bs logger.info(f"data parallel size = {dp_size},\n" f"gradient accumulation steps = {ga_steps},\n" f"per device train batch size = {per_device_bs},\n" - f"global train batch size = {global_train_batch_size}") + f"global train batch size = {self.global_train_batch_size}") self.dataloader = DataLoader( dataset=self.dataset, - batch_size=global_train_batch_size, - 
shuffle=True, # Enable shuffle for better training + batch_size=self.global_train_batch_size, + shuffle=False, drop_last=True, num_workers=self.pipeline_config.sft_train.training_args.dataloader_num_workers, collate_fn=data_collator, @@ -161,11 +161,11 @@ def __init__(self, pipeline_config: SFTConfig): if self.val_dataset: self.val_dataset = preprocess_dataset( - self.val_dataset, - self.pipeline_config.sequence_length, - encode_function, + self.val_dataset, + self.pipeline_config.sequence_length, + encode_function, num_proc=self.pipeline_config.sft_train.data_args.preprocessing_num_workers) - + global_val_batch_size = dp_size * ga_steps * self.pipeline_config.sft_train.infer_batch_size self.val_dataloader = DataLoader( dataset=self.val_dataset, @@ -207,7 +207,12 @@ def run(self): with Timer(name="step_train", logger=None) as step_train_timer: batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"global_step": global_step, "is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"global_step": global_step, "is_offload_optimizer_states_in_train_step": False, + "loss_mask_keys": ["labels"]} + # Reorder data for DP rank load balancing + batch_balance_metrics = batch_balance(batch, dp_size=self.sft_train.dp_size, + minibatch_size=self.global_train_batch_size) + metrics_mgr.add_metrics(batch_balance_metrics) train_metrics_refs = self.sft_train.train_step(batch, blocking=False) train_metrics = DataProto.materialize_concat(data_refs=train_metrics_refs) train_metrics = train_metrics.meta_info.pop("metrics", {}) @@ -221,7 +226,7 @@ def run(self): # Update tqdm progress bar loss = metrics.get("sft_train/loss", 0) pbar.set_postfix({"loss": f"{loss:.4f}", "step": f"{global_step}/{total_steps}"}) - + self.state.step = global_step self.state.log_history.append(metrics) self.do_checkpoint(global_step=global_step) @@ -240,11 +245,12 @@ def run(self): @torch.no_grad() def val(self): val_loss_list = [] - for batch_dict in self.val_dataloader: + pbar = tqdm(self.val_dataloader, desc="Validating", leave=False) + for batch_dict in pbar: batch: DataProto = DataProto.from_single_dict(batch_dict) - batch.meta_info = {"is_offload_optimizer_states_in_train_step": False} + batch.meta_info = {"is_offload_optimizer_states_in_train_step": False, 'loss_mask_keys': ['labels']} val_metrics_refs = self.sft_train.val_step(batch, blocking=False) val_metrics = DataProto.materialize_concat(data_refs=val_metrics_refs) - val_metrics = val_metrics.meta_info.pop("metrics", {}) - val_loss_list.append(val_metrics[f"sft_train/loss"]) - return {"sft_train/val_loss": np.concatenate(val_loss_list)} + val_metrics = reduce_metrics(val_metrics.meta_info.pop("metrics", {})) + val_loss_list.append(val_metrics[f"sft_train/loss@sum"]) + return {"sft_train/val_loss": val_loss_list} diff --git a/roll/pipeline/sft/sft_worker.py b/roll/pipeline/sft/sft_worker.py index aedc73ae7..d76866b96 100644 --- a/roll/pipeline/sft/sft_worker.py +++ b/roll/pipeline/sft/sft_worker.py @@ -10,6 +10,7 @@ from roll.distributed.scheduler.protocol import DataProto from roll.distributed.strategy.factory import create_strategy from roll.distributed.strategy.strategy import InferenceStrategy, TrainStrategy +from roll.utils.functionals import reduce_metrics from roll.models.model_providers import default_actor_model_provider from roll.platforms import current_platform @@ -32,12 +33,8 @@ def train_step(self, data: DataProto): data = data.to(current_platform.device_type) data = self.strategy.get_data_input(data) - loss_func = 
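batch_balance, imported from roll.utils.functionals, reorders samples so every data-parallel rank in a mini-batch receives a similar number of tokens. A simplified version of that kind of balancing (illustrative only, assuming sequence lengths are taken from the attention mask) is:

import torch

def balanced_order(seq_lens: torch.Tensor, dp_size: int) -> torch.Tensor:
    """Permutation that spreads long sequences evenly across dp_size buckets."""
    order = torch.argsort(seq_lens, descending=True)
    per_rank = (len(seq_lens) + dp_size - 1) // dp_size
    buckets = [[] for _ in range(dp_size)]
    loads = [0] * dp_size
    for idx in order.tolist():
        # longest remaining sequence goes to the least-loaded bucket with room
        open_ranks = [r for r in range(dp_size) if len(buckets[r]) < per_rank]
        r = min(open_ranks, key=lambda rank: loads[rank])
        buckets[r].append(idx)
        loads[r] += int(seq_lens[idx])
    return torch.tensor([i for b in buckets for i in b])

The RLVR pipelines tag each sample with a prompt_id before balancing and later call batch.reorder(indices=torch.argsort(batch.batch["prompt_id"])) to restore the original order once the per-rank work is done.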
self.loss_func - if self.worker_config.use_sequence_packing: - from roll.utils.sequence_packing import SequencePackingSFTLossWrapper - loss_func = SequencePackingSFTLossWrapper(self.strategy, loss_func) + metrics = self.strategy.train_step(batch=data, loss_func=self.loss_func) - metrics = self.strategy.train_step(batch=data, loss_func=loss_func) output = DataProto(meta_info={"metrics": metrics}).to("cpu") return output @@ -47,16 +44,19 @@ def val_step(self, data: DataProto): data.meta_info["micro_batch_size"] = self.worker_config.infer_batch_size data = self.strategy.get_data_input(data) metrics = self.strategy.forward_step(batch=data, forward_func=self.loss_func) + if metrics is None: + metrics = {} + metrics = reduce_metrics(metrics) output = DataProto(meta_info={"metrics": metrics}).to("cpu") return output @register(Dispatch.ONE_TO_ALL) - def do_checkpoint(self, global_step): + def do_checkpoint(self, global_step, is_last_step=False): with Timer("do_checkpoint") as total_timer: ckpt_id = f"checkpoint-{global_step}" save_dir = os.path.join(self.pipeline_config.output_dir, self.worker_name, ckpt_id, self.cluster_name) self.logger.info(f"save checkpoint-{global_step} to {save_dir}") - exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id) + exec_metrics: Dict = self.strategy.save_checkpoint(save_dir, global_step, ckpt_id, is_last_step=is_last_step) metrics = { f"time/{self.cluster_name}/do_checkpoint/total": total_timer.last, @@ -68,6 +68,6 @@ def do_checkpoint(self, global_step): def loss_func(self, data: DataProto, output_tensor: torch.Tensor): labels = data.batch["labels"] - loss = self.strategy.op_compute_language_loss(output_tensor, labels) - metrics = {f"{self.worker_config.name}/loss": loss.detach().float().unsqueeze(0)} - return loss, metrics \ No newline at end of file + batch_num_tokens = data.meta_info['batch_num_tokens']['labels'] + loss, metrics = self.strategy.op_compute_language_loss(output_tensor, labels, batch_num_tokens) + return loss, metrics diff --git a/roll/platforms/cuda.py b/roll/platforms/cuda.py index 5b46ba925..a809003e0 100644 --- a/roll/platforms/cuda.py +++ b/roll/platforms/cuda.py @@ -1,6 +1,7 @@ from .platform import Platform from ..utils.logging import get_logger +import os import torch logger = get_logger() @@ -36,8 +37,9 @@ def get_custom_env_vars(cls) -> dict: "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", "TORCHINDUCTOR_COMPILE_THREADS": "2", "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True", - "NCCL_CUMEM_ENABLE": "0", # https://github.com/NVIDIA/nccl/issues/1234 + "NCCL_CUMEM_ENABLE": os.getenv("NCCL_CUMEM_ENABLE", "0"), # https://github.com/NVIDIA/nccl/issues/1234 "NCCL_NVLS_ENABLE": "0", + "NVTE_BWD_LAYERNORM_SM_MARGIN": os.getenv('NVTE_BWD_LAYERNORM_SM_MARGIN', "0"), } return env_vars @@ -46,7 +48,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") diff --git a/roll/platforms/npu.py b/roll/platforms/npu.py index ce872350e..0fd955113 100644 --- a/roll/platforms/npu.py +++ b/roll/platforms/npu.py @@ -50,7 +50,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm_ascend.worker.worker_v1 import NPUWorker as Worker logger.info("Successfully 
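The SFT loss_func above now forwards a pre-computed batch_num_tokens into op_compute_language_loss; the apparent intent is to normalize the summed token loss by the number of supervised tokens in the whole batch rather than in each micro-batch, which keeps gradient accumulation over unevenly sized micro-batches equivalent to one large-batch step. A sketch of that normalization (assuming labels are already shifted and use -100 as the ignore index):

import torch
import torch.nn.functional as F

IGNORE_INDEX = -100  # assumed ignore value for unsupervised positions

def token_normalized_ce(logits: torch.Tensor, labels: torch.Tensor,
                        batch_num_tokens: int) -> torch.Tensor:
    # Sum the per-token loss over this micro-batch, divide by the global count
    # of supervised tokens so accumulated micro-batch losses add up correctly.
    loss_sum = F.cross_entropy(
        logits.view(-1, logits.size(-1)),
        labels.view(-1),
        ignore_index=IGNORE_INDEX,
        reduction="sum",
    )
    return loss_sum / max(batch_num_tokens, 1)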
imported vLLM V1 Worker.") @@ -79,4 +80,5 @@ def apply_ulysses_patch(cls) -> None: @classmethod def device_memory_used(cls) -> None: - return torch.npu.mem_get_info()[0] \ No newline at end of file + free, total = torch.npu.mem_get_info() + return total - free \ No newline at end of file diff --git a/roll/platforms/rocm.py b/roll/platforms/rocm.py index c55b59a84..ecab6b0fc 100644 --- a/roll/platforms/rocm.py +++ b/roll/platforms/rocm.py @@ -14,6 +14,7 @@ class RocmPlatform(Platform): device_control_env_var: str = "HIP_VISIBLE_DEVICES" ray_experimental_noset: str = "RAY_EXPERIMENTAL_NOSET_CUDA_VISIBLE_DEVICES" communication_backend: str = "nccl" + Event: type = torch.cuda.Event @classmethod def is_rocm(cls) -> bool: @@ -35,17 +36,14 @@ def get_custom_env_vars(cls) -> dict: "VLLM_ALLOW_INSECURE_SERIALIZATION": "1", # These VLLM related enviroment variables are related to backend. maybe used afterwards. # "VLLM_USE_TRITON_FLASH_ATTN":"0", - # "VLLM_ROCM_USE_AITER":"1", - # "VLLM_ROCM_USE_AITER_MOE":"1", + "VLLM_ROCM_USE_AITER":"1", + "VLLM_ROCM_USE_AITER_MOE":"1", # "VLLM_ROCM_USE_AITER_ASMMOE":"1", - # "VLLM_ROCM_USE_AITER_PAGED_ATTN":"1", + "VLLM_ROCM_USE_AITER_PAGED_ATTN":"1", # "RAY_DEBUG": "legacy", - "VLLM_USE_V1": "1", + "VLLM_USE_V1": "0", "TORCHINDUCTOR_COMPILE_THREADS": "2", "PYTORCH_HIP_ALLOC_CONF": "expandable_segments:True", - "SAFETENSORS_FAST_GPU":"1", - "VLLM_ROCM_USE_AITER_MHA":"0", - "VLLM_ALLOW_LONG_MAX_MODEL_LEN":"1", # "NCCL_DEBUG_SUBSYS":"INIT,COLL", # "NCCL_DEBUG":"INFO", # "NCCL_DEBUG_FILE":"rccl.%h.%p.log", @@ -77,7 +75,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") diff --git a/roll/platforms/unknown.py b/roll/platforms/unknown.py index 43db59ed5..71e702e3d 100644 --- a/roll/platforms/unknown.py +++ b/roll/platforms/unknown.py @@ -47,7 +47,8 @@ def get_vllm_worker_class(cls): try: from vllm import envs - if envs.VLLM_USE_V1: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: from vllm.v1.worker.gpu_worker import Worker logger.info("Successfully imported vLLM V1 Worker.") @@ -66,6 +67,7 @@ def get_vllm_run_time_env_vars(cls, gpu_rank:str) -> dict: env_vars = { "PYTORCH_CUDA_ALLOC_CONF" : "", "VLLM_ALLOW_INSECURE_SERIALIZATION":"1", + "CUDA_VISIBLE_DEVICES": f"{gpu_rank}", } return env_vars diff --git a/roll/third_party/deepspeed/model_update.py b/roll/third_party/deepspeed/model_update.py new file mode 100644 index 000000000..b6452902c --- /dev/null +++ b/roll/third_party/deepspeed/model_update.py @@ -0,0 +1,205 @@ +import ray +import torch.distributed as dist +from deepspeed.runtime.zero import GatheredParameters +from peft import get_peft_model_state_dict + +from roll.configs.base_config import PPOConfig +from roll.configs.worker_config import is_actor_infer_overlapping_with_any_cluster +from roll.utils.collective import collective +from roll.utils.logging import get_logger +from roll.utils.network_utils import collect_free_port, get_node_ip +from roll.utils.send_recv_utils import serialize_named_weights + + +logger = get_logger() + + +def _get_ds_param_size(param): + if hasattr(param, "ds_numel"): + ds_numel = param.ds_numel + else: + ds_numel = param.numel() + return ds_numel * param.element_size() + + +def _gather_weights(is_zero3, named_params): + if not is_zero3: + 
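+        # ZeRO stages 0-2 keep the full parameters on every rank, so .data can be read directly;
+        # ZeRO-3 shards parameters, which is why the GatheredParameters context below is needed.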
return [(n, p.data) for n, p in named_params] + with GatheredParameters([p for _, p in named_params]): + return [(n, p.data) for n, p in named_params] + + +def gather_deepspeed_weights(model, ds_config, buffer_size): + is_zero3 = ds_config.is_zero3() + named_params = [(name, param) for name, param in model.named_parameters()] + + waiting_params, waiting_params_size = [], 0 + for name, param in named_params: + if waiting_params and waiting_params_size + _get_ds_param_size(param) > buffer_size: + yield _gather_weights(is_zero3, waiting_params) + waiting_params, waiting_params_size = [], 0 + waiting_params_size += _get_ds_param_size(param) + waiting_params.append((name, param)) + + if waiting_params: + yield _gather_weights(is_zero3, waiting_params) + + +class DeepSpeedWeightUpdater: + def __init__(self, pipeline_config: PPOConfig, infer_cluster, worker_config, model_update_name: str, model, ds_config, is_lora): + self.pipeline_config = pipeline_config + self.worker_config = worker_config + self.model_update_name = model_update_name + self.model = model + self.ds_config = ds_config + self.model_update_infer_workers = infer_cluster.workers + self._model_update_buffer_size = pipeline_config.model_update_buffer_size_mb * 1024 * 1024 # Convert MB to bytes + self.is_lora = is_lora + self.infer_worker_config = infer_cluster.worker_config + self.infer_cluster = infer_cluster + self.is_colocated = is_actor_infer_overlapping_with_any_cluster(infer_cluster.worker_config, actor_train=worker_config) + + # Colocated mode attributes + self._infer_parallel_cpu_group = None + self._co_infer_worker = None + self._buffer_num = None + self._broadcast_workers = None + + # Separated mode attributes + self.model_update_group_name = None + self._model_update_locker = None + + if self.is_colocated: + self._setup_colocated_model_update() + else: + self._setup_separated_model_update() + + def model_update(self): + if self.is_colocated: + return self._colocated_model_update() + return self._separated_model_update() + + def _setup_colocated_model_update(self): + logger.info(f"RANK {dist.get_rank()} Setup colocated model update") + infer_worker_devices_num = self.infer_worker_config.num_gpus_per_worker + train_world_size = dist.get_world_size() + + device_start_diff = min(self.worker_config.device_mapping) - min(self.infer_worker_config.device_mapping) + device_end_diff = max(self.worker_config.device_mapping) - max(self.infer_worker_config.device_mapping) + + assert device_start_diff % infer_worker_devices_num == 0 + assert device_end_diff % infer_worker_devices_num == 0 + + for start_rank in range(0, train_world_size, infer_worker_devices_num): + end_rank = start_rank + infer_worker_devices_num + assert end_rank <= train_world_size + group_ranks = list(range(start_rank, end_rank)) + new_group = dist.new_group(ranks=group_ranks, backend="gloo") + if dist.get_rank() in group_ranks: + self._infer_parallel_cpu_group = new_group + infer_worker_idx = dist.get_rank() + device_start_diff // infer_worker_devices_num + self._co_infer_worker = None + if 0 <= infer_worker_idx < len(self.model_update_infer_workers): + self._co_infer_worker = self.model_update_infer_workers[infer_worker_idx] + + # rank0 broadcast to mismatch workers + if dist.get_rank() == 0 and (device_start_diff > 0 or device_end_diff < 0): + self._broadcast_workers = [] + if device_start_diff > 0: + self._broadcast_workers.extend(self.model_update_infer_workers[: device_start_diff // infer_worker_devices_num]) + if device_end_diff < 0: + 
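+                # These tail infer workers sit past the train ranks' device range, so rank 0
+                # serves them through the broadcast group instead of a colocated gloo group.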
self._broadcast_workers.extend(self.model_update_infer_workers[device_end_diff // infer_worker_devices_num :]) + self._setup_broadcast_group() + + def _setup_separated_model_update(self): + if dist.get_rank() != 0: + return + + self._broadcast_workers = self.model_update_infer_workers + self._setup_broadcast_group() + + def _setup_broadcast_group(self): + if not self._broadcast_workers: + return + self.model_update_group_name = f"{self.model_update_name}_deepspeed" + num_gpus_per_infer_worker = self.infer_worker_config.num_gpus_per_worker + infer_device_num = num_gpus_per_infer_worker * len(self._broadcast_workers) + master_address, master_port = get_node_ip(), collect_free_port() + + refs = [ + infer_worker.setup_collective_group.remote( + master_address=master_address, + master_port=master_port, + group_name=self.model_update_group_name, + rank_offset=i * num_gpus_per_infer_worker + 1, + world_size=infer_device_num + 1, + ) + for i, infer_worker in enumerate(self._broadcast_workers) + ] + collective.init_collective_group( + infer_device_num + 1, + 0, + group_name=self.model_update_group_name, + master_addr=master_address, + master_port=master_port, + ) + ray.get(refs) + + logger.info(f"Init weights update group {self.model_update_group_name}") + + def _colocated_model_update(self): + refs = [] + for named_weights in gather_deepspeed_weights( + self.model, self.ds_config, buffer_size=self._model_update_buffer_size + ): + serialized_tensors = serialize_named_weights( + named_weights, infer_strategy=self.infer_worker_config.strategy_args.strategy_name + ) + infer_parallel_size = dist.get_world_size(self._infer_parallel_cpu_group) + co_infer_rank = dist.get_rank(self._infer_parallel_cpu_group) + infer_parallel_tensors = [serialized_tensors] # tensors for each infer parallel rank + if infer_parallel_size > 1: + infer_parallel_tensors = [None] * infer_parallel_size if co_infer_rank == 0 else None + dist.gather_object( + serialized_tensors, infer_parallel_tensors, group_dst=0, group=self._infer_parallel_cpu_group + ) + if refs: + ray.get(refs) + refs = [] + if co_infer_rank == 0 and self._co_infer_worker is not None: + refs.append(self._co_infer_worker.update_parameter_in_bucket.remote(infer_parallel_tensors)) + if self._broadcast_workers: + refs.extend(self._broadcast_to_infer_workers(named_weights)) + if refs: + ray.get(refs) + return {} + + def _broadcast_to_infer_workers(self, named_weights) -> list[ray.ObjectRef]: + if not self._broadcast_workers: + return [] + refs = [ + worker.broadcast_parameter.remote( + group_name=self.model_update_group_name, + names=[n for n, _ in named_weights], + dtypes=[w.dtype for _, w in named_weights], + shapes=[w.shape for _, w in named_weights], + ) + for worker in self._broadcast_workers + ] + handles = [] + for _, weight in named_weights: + handles.append( + collective.broadcast(tensor=weight, src_rank=0, group_name=self.model_update_group_name, async_op=True) + ) + for handle in handles: + handle.wait() + return refs + + def _separated_model_update(self): + logger.info(f"start broadcast model update {self.model_update_group_name}") + for named_weights in gather_deepspeed_weights( + self.model, self.ds_config, buffer_size=self._model_update_buffer_size + ): + refs = self._broadcast_to_infer_workers(named_weights) + ray.get(refs) + return {} diff --git a/roll/third_party/vllm/vllm_0_10_0/__init__.py b/roll/third_party/fsdp2/__init__.py similarity index 100% rename from roll/third_party/vllm/vllm_0_10_0/__init__.py rename to 
roll/third_party/fsdp2/__init__.py diff --git a/roll/third_party/fsdp2/model_update.py b/roll/third_party/fsdp2/model_update.py new file mode 100644 index 000000000..f575ef82d --- /dev/null +++ b/roll/third_party/fsdp2/model_update.py @@ -0,0 +1,323 @@ +import os +from dataclasses import asdict + +import ray +import torch +import torch.distributed as dist +from torch.distributed.tensor import DTensor + +from roll.configs.base_config import PPOConfig +from roll.configs.worker_config import is_actor_infer_overlapping_with_any_cluster +from roll.utils.collective import collective +from roll.utils.logging import get_logger +from roll.utils.network_utils import collect_free_port, get_node_ip +from roll.utils.send_recv_utils import serialize_named_weights + +logger = get_logger() + + +def gather_fsdp2_weights(model, buffer_size, is_lora=False): + """ + Gather FSDP2 weights for model update. + For FSDP2, we need to get the full tensor from the sharded parameters. + """ + if is_lora: + from peft.utils import get_peft_model_state_dict + + lora_state_dict = get_peft_model_state_dict(model) + named_params = [(name, param) for name, param in lora_state_dict.items()] + else: + named_params = [(name, param) for name, param in model.named_parameters()] + + waiting_params, waiting_params_size = [], 0 + for name, param in named_params: + full_tensor_size = param.numel() * param.element_size() + if waiting_params and waiting_params_size + full_tensor_size > buffer_size: + yield [(n, p.data if not isinstance(p.data, DTensor) else p.data.full_tensor()) for n, p in waiting_params] + waiting_params, waiting_params_size = [], 0 + + waiting_params_size += full_tensor_size + waiting_params.append((name, param)) + + if waiting_params: + yield [(n, p.data if not isinstance(p.data, DTensor) else p.data.full_tensor()) for n, p in waiting_params] + + +class FSDP2WeightUpdater: + def __init__( + self, pipeline_config: PPOConfig, infer_cluster, worker_config, model_update_name: str, model, is_lora + ): + self.pipeline_config = pipeline_config + self.worker_config = worker_config + self.model_update_name = model_update_name + self.model = model + self.model_update_infer_workers = infer_cluster.workers + self._model_update_buffer_size = ( + pipeline_config.model_update_buffer_size_mb * 1024 * 1024 + ) # Convert MB to bytes + self.is_lora = is_lora + self.infer_worker_config = infer_cluster.worker_config + self.infer_cluster = infer_cluster + self.is_colocated = is_actor_infer_overlapping_with_any_cluster( + infer_cluster.worker_config, actor_train=worker_config + ) + + # Colocated mode attributes + self._infer_parallel_cpu_group = None + self._co_infer_worker = None + self._buffer_num = None + self._broadcast_workers = None + + # Separated mode attributes + self.model_update_group_name = None + self._model_update_locker = None + + if self.is_colocated: + self._setup_colocated_model_update() + else: + self._setup_separated_model_update() + + def model_update(self): + if self.is_colocated: + return self._colocated_model_update() + return self._separated_model_update() + + def _setup_colocated_model_update(self): + logger.info(f"RANK {dist.get_rank()} Setup colocated model update") + infer_worker_devices_num = self.infer_worker_config.num_gpus_per_worker + train_world_size = dist.get_world_size() + + device_start_diff = min(self.worker_config.device_mapping) - min(self.infer_worker_config.device_mapping) + device_end_diff = max(self.worker_config.device_mapping) - max(self.infer_worker_config.device_mapping) + + assert 
device_start_diff % infer_worker_devices_num == 0 + assert device_end_diff % infer_worker_devices_num == 0 + + for start_rank in range(0, train_world_size, infer_worker_devices_num): + end_rank = start_rank + infer_worker_devices_num + assert end_rank <= train_world_size + group_ranks = list(range(start_rank, end_rank)) + new_group = dist.new_group(ranks=group_ranks, backend="gloo") + if dist.get_rank() in group_ranks: + self._infer_parallel_cpu_group = new_group + infer_worker_idx = (dist.get_rank() // infer_worker_devices_num) + ( + device_start_diff // infer_worker_devices_num + ) + self._co_infer_worker = None + self._co_infer_worker_rank = None + if 0 <= infer_worker_idx < len(self.model_update_infer_workers): + self._co_infer_worker = self.model_update_infer_workers[infer_worker_idx] + self._co_infer_worker_rank = infer_worker_idx + + # rank0 broadcast to mismatch workers + if dist.get_rank() == 0 and (device_start_diff > 0 or device_end_diff < 0): + self._broadcast_workers = [] + if device_start_diff > 0: + self._broadcast_workers.extend( + self.model_update_infer_workers[: device_start_diff // infer_worker_devices_num] + ) + if device_end_diff < 0: + self._broadcast_workers.extend( + self.model_update_infer_workers[device_end_diff // infer_worker_devices_num :] + ) + self._setup_broadcast_group() + + def _get_local_visible_gpu_rank(self) -> int: + """Return the first visible GPU rank from CUDA_VISIBLE_DEVICES. + + In colocated mode (CUDA IPC), the serialized CUDA tensor must be rebuilt + on the exact same physical GPU as the sender rank used. We use the + physical GPU id (gpu_rank) to align TP-ranks between train and vLLM. + """ + cuda_visible = os.environ.get("CUDA_VISIBLE_DEVICES", "") + if not cuda_visible: + return 0 + return int(cuda_visible.split(",")[0].strip()) + + def _get_local_global_gpu_id(self) -> int: + """Return global GPU id for current train rank based on device_mapping. + + device_mapping uses global ids: global_id = node_rank * gpu_per_node + gpu_rank. + This is the only stable identifier to align tensors across nodes. + """ + return int(self.worker_config.device_mapping[dist.get_rank()]) + + def _get_co_infer_gpu_rank_order(self) -> list[int] | None: + """Get per-TP-rank GPU order as seen by the colocated infer worker.""" + if self._co_infer_worker is None: + return None + cached = getattr(self, "_co_infer_gpu_rank_order", None) + if cached is not None: + return cached + devices_info = ray.get(self._co_infer_worker.get_devices_info.remote()) + order = [int(d["gpu_rank"]) for d in devices_info] + setattr(self, "_co_infer_gpu_rank_order", order) + return order + + def _get_co_infer_global_gpu_id_order(self) -> list[int] | None: + """Get per-TP-rank global GPU id order for the colocated infer worker. + + vLLM indexes `serialized_named_tensors` by its internal worker rank, which + follows `resource_placement_groups` order, which in turn follows the + infer worker's device_mapping slice order. 
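+
+        Returns None when this train rank is not colocated with any infer worker.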
+ """ + if self._co_infer_worker_rank is None: + return None + num = self.infer_worker_config.num_gpus_per_worker + start = int(self._co_infer_worker_rank) * num + end = start + num + return [int(x) for x in self.infer_worker_config.device_mapping[start:end]] + + def _setup_separated_model_update(self): + if dist.get_rank() != 0: + return + + self._broadcast_workers = self.model_update_infer_workers + self._setup_broadcast_group() + + def _setup_broadcast_group(self): + if not self._broadcast_workers: + return + self.model_update_group_name = f"{self.model_update_name}_fsdp2" + num_gpus_per_infer_worker = self.infer_worker_config.num_gpus_per_worker + infer_device_num = num_gpus_per_infer_worker * len(self._broadcast_workers) + master_address, master_port = get_node_ip(), collect_free_port() + + refs = [ + infer_worker.setup_collective_group.remote( + master_address=master_address, + master_port=master_port, + group_name=self.model_update_group_name, + rank_offset=i * num_gpus_per_infer_worker + 1, + world_size=infer_device_num + 1, + ) + for i, infer_worker in enumerate(self._broadcast_workers) + ] + collective.init_collective_group( + infer_device_num + 1, + 0, + group_name=self.model_update_group_name, + master_addr=master_address, + master_port=master_port, + ) + ray.get(refs) + + logger.info(f"Init weights update group {self.model_update_group_name}") + + def _colocated_model_update(self): + refs = [] + infer_parallel_size = dist.get_world_size(self._infer_parallel_cpu_group) + co_infer_rank = dist.get_rank(self._infer_parallel_cpu_group) + for named_weights in gather_fsdp2_weights( + self.model, buffer_size=self._model_update_buffer_size, is_lora=self.is_lora + ): + if self._co_infer_worker is not None: + serialized_tensors = serialize_named_weights( + named_weights, infer_strategy=self.infer_worker_config.strategy_args.strategy_name + ) + send_global_gpu_id = self._get_local_global_gpu_id() + send_obj = {"global_gpu_id": send_global_gpu_id, "payload": serialized_tensors} + infer_parallel_tensors = [serialized_tensors] # tensors for each infer parallel rank + if infer_parallel_size > 1: + infer_parallel_tensors = [None] * infer_parallel_size if co_infer_rank == 0 else None + dist.gather_object( + send_obj, infer_parallel_tensors, group_dst=0, group=self._infer_parallel_cpu_group + ) + if refs: + ray.get(refs) + refs = [] + if co_infer_rank == 0 and self._co_infer_worker is not None: + # Align gathered per-train-rank payloads with vLLM TP-rank GPU order. + if infer_parallel_size > 1: + assert isinstance(infer_parallel_tensors, list) + infer_global_gpu_id_order = self._get_co_infer_global_gpu_id_order() + if infer_global_gpu_id_order is not None and len(infer_global_gpu_id_order) == infer_parallel_size: + global_id_to_idx = {gid: i for i, gid in enumerate(infer_global_gpu_id_order)} + reordered = [None] * infer_parallel_size + extras = [] + for item in infer_parallel_tensors: + if not isinstance(item, dict) or "global_gpu_id" not in item or "payload" not in item: + # Backward compatibility: old format was the raw payload. + extras.append(item) + continue + idx = global_id_to_idx.get(int(item["global_gpu_id"])) + if idx is None: + extras.append(item) + continue + reordered[idx] = item["payload"] + # Fill holes with any extras to avoid hard crash; vLLM side will still + # error if GPU mismatch, but this gives best-effort compatibility. 
+ for i in range(infer_parallel_size): + if reordered[i] is None and extras: + extra = extras.pop(0) + reordered[i] = ( + extra["payload"] if isinstance(extra, dict) and "payload" in extra else extra + ) + if any(x is None for x in reordered): + missing = [i for i, x in enumerate(reordered) if x is None] + raise RuntimeError( + "FSDP2 colocated model update failed to align TP-ranks to GPUs. " + f"Missing indices={missing}, infer_global_gpu_id_order={infer_global_gpu_id_order}, " + f"gathered={infer_parallel_tensors}" + ) + infer_parallel_tensors = reordered + else: + infer_parallel_tensors = [ + (x["payload"] if isinstance(x, dict) and "payload" in x else x) + for x in infer_parallel_tensors + ] + else: + infer_parallel_tensors = [serialized_tensors] + refs.append( + self._co_infer_worker.update_parameter_in_bucket.remote( + infer_parallel_tensors, is_lora=self.is_lora + ) + ) + if self._broadcast_workers: + refs.extend(self._broadcast_to_infer_workers(named_weights)) + if refs: + ray.get(refs) + self._add_lora_to_infer_workers() + torch.cuda.empty_cache() + return {} + + def _broadcast_to_infer_workers(self, named_weights) -> list[ray.ObjectRef]: + if not self._broadcast_workers: + return [] + refs = [ + worker.broadcast_parameter.remote( + group_name=self.model_update_group_name, + names=[n for n, _ in named_weights], + dtypes=[w.dtype for _, w in named_weights], + shapes=[w.shape for _, w in named_weights], + is_lora=self.is_lora, + ) + for worker in self._broadcast_workers + ] + handles = [] + for _, weight in named_weights: + handles.append( + collective.broadcast(tensor=weight, src_rank=0, group_name=self.model_update_group_name, async_op=True) + ) + for handle in handles: + handle.wait() + return refs + + def _separated_model_update(self): + logger.info(f"start broadcast model update {self.model_update_group_name}") + for named_weights in gather_fsdp2_weights( + self.model, buffer_size=self._model_update_buffer_size, is_lora=self.is_lora + ): + refs = self._broadcast_to_infer_workers(named_weights) + ray.get(refs) + self._add_lora_to_infer_workers() + torch.cuda.empty_cache() + return {} + + def _add_lora_to_infer_workers(self): + if dist.get_rank() != 0 or not self.is_lora: + return + peft_config = self.model.peft_config.get("default", None) + ray.get( + [worker.add_lora.remote(peft_config=asdict(peft_config)) for worker in self.model_update_infer_workers] + ) diff --git a/roll/third_party/fsdp2/qwen3_moe_patch.py b/roll/third_party/fsdp2/qwen3_moe_patch.py new file mode 100644 index 000000000..5686a8495 --- /dev/null +++ b/roll/third_party/fsdp2/qwen3_moe_patch.py @@ -0,0 +1,36 @@ +import torch +import torch.nn.functional as F + + +# force each expert to participate in computation graph so FSDP could gather all expert outputs +def qwen3_moe_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + router_logits = self.gate(hidden_states) + + routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + if self.norm_topk_prob: + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + expert_mask = torch.nn.functional.one_hot(selected_experts, 
num_classes=self.num_experts).permute(2, 1, 0) + + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + if top_x.numel() > 0: + current_state = hidden_states[None, top_x].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None] + final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + else: + dummy_output = expert_layer(hidden_states[:1]) * 0.0 + final_hidden_states[:1] = final_hidden_states[:1] + dummy_output + + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states, router_logits diff --git a/roll/third_party/fsdp2/tiled_mlp.py b/roll/third_party/fsdp2/tiled_mlp.py new file mode 100644 index 000000000..361688dd0 --- /dev/null +++ b/roll/third_party/fsdp2/tiled_mlp.py @@ -0,0 +1,239 @@ +# Copyright 2024 Bytedance Ltd. and/or its affiliates +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +FSDP2-compatible TiledMLP implementation for memory-efficient MLP computation. + +This module provides a tiled MLP implementation that reduces peak memory usage +by processing the MLP forward/backward pass in chunks (tiles). This is particularly +useful for large models with FSDP2 training. + +Reference: https://github.com/volcengine/verl/blob/main/verl/models/transformers/tiled_mlp.py#L1-L237 +""" + +import threading +from typing import Optional + +import torch +import torch.nn as nn + + +class GradientAccumulator: + """Gradient accumulator for TiledMLP (FSDP compatible). + + This class manages gradient accumulation across multiple shards during + the backward pass of TiledMLP. It ensures correct gradient computation + when processing input in chunks. 
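+
+    Gradients are accumulated in grad_accumulation_dtype (float32 by default); only the hook
+    installed for the last shard hands the summed gradient back to the parameter.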
+ """ + + def __init__(self, params: list[torch.nn.Parameter], total_shards: int, dtype: torch.dtype = None): + self.params = params + self.total_shards = total_shards + self.grad_accumulation_dtype = dtype or torch.float32 + self.accumulated_grads = {} + self.hooks = [] + self.lock = threading.Lock() + + for param in self.params: + if param.grad is not None: + self.accumulated_grads[param] = param.grad.to(self.grad_accumulation_dtype) + param.grad = None + else: + self.accumulated_grads[param] = torch.zeros_like(param, dtype=self.grad_accumulation_dtype) + + def install_hooks(self, is_last_shard: bool): + """Install gradient hooks for the current shard.""" + self._remove_hooks() + + def create_hook(param): + def hook(grad): + with self.lock: + grad_to_accum_dtype = grad.to(self.grad_accumulation_dtype) + self.accumulated_grads[param] += grad_to_accum_dtype + + if is_last_shard: + param.grad = None # Critical: prevent double accumulation + final_grad = self.accumulated_grads[param].to(param.dtype) + return final_grad + return None + + return hook + + for param in self.params: + if param.requires_grad: + hook = param.register_hook(create_hook(param)) + self.hooks.append(hook) + + def _remove_hooks(self): + """Remove all registered hooks.""" + for hook in self.hooks: + hook.remove() + self.hooks.clear() + + def cleanup(self): + """Cleanup hooks and resources.""" + self._remove_hooks() + + +class TiledMLP(torch.autograd.Function): + """TiledMLP implementation for memory-efficient MLP computation. + + This autograd function processes MLP forward/backward in tiles (chunks) + to reduce peak memory usage. Compatible with FSDP2. + """ + + @staticmethod + def forward(ctx, fn, module, x, shards, compute_params): + ctx.fn = fn + ctx.module = module + ctx.shards = shards + ctx.compute_params = [p for p in compute_params if p.requires_grad] + ctx.save_for_backward(x) + + # Split on dim=-2 (seqlen dimension) following Liger Kernel style + x_shards = list(torch.chunk(x, chunks=shards, dim=-2)) + with torch.no_grad(): + output_shards = [fn(module, x_shard) for x_shard in x_shards] + output_unsharded = torch.cat(output_shards, dim=-2) + return output_unsharded + + @staticmethod + def backward(ctx, *grads): + fn = ctx.fn + (x,) = ctx.saved_tensors + module = ctx.module + shards = ctx.shards + compute_params = ctx.compute_params + + x_requires_grad = x.requires_grad + x = x.detach() + x.requires_grad_(x_requires_grad) + + # Flatten to [bs*seqlen, hidden_size] + hidden_size = x.shape[-1] + x_shape_orig = x.shape + x = x.view(-1, hidden_size) + incoming_grad = grads[0].view(-1, hidden_size) + + # Pre-allocate input gradient + x_grad = torch.zeros_like(x) + + # Split on dim=0 + x_shards = list(torch.chunk(x, chunks=shards, dim=0)) + + grad_accumulator = GradientAccumulator(compute_params, shards, dtype=x.dtype) + + for i, x_shard in enumerate(x_shards): + x_shard.requires_grad_(x_requires_grad) + + shard_step = x_shards[i].shape[0] + shard_offset = i * x_shards[0].shape[0] + + # narrow(0, ...) 
creates a contiguous view that can receive gradients + x_shard.grad = x_grad.narrow(0, shard_offset, shard_step) + incoming_grad_shard = incoming_grad.narrow(0, shard_offset, shard_step) + + is_last_shard = i + 1 == shards + grad_accumulator.install_hooks(is_last_shard) + + with torch.enable_grad(): + output = fn(module, x_shard) + torch.autograd.backward(output, incoming_grad_shard) + + grad_accumulator.cleanup() + del grad_accumulator + + # Restore original shape + x_grad = x_grad.view(x_shape_orig) if x_requires_grad else None + return (None, None, x_grad, None, None) + + +def _mlp_forward_fn(module, x): + """Forward function for LlamaMLP / Qwen2MLP / Qwen3MLP style.""" + return module.down_proj(module.act_fn(module.gate_proj(x)) * module.up_proj(x)) + + +# ============================================================================ +# Monkey Patch Functions +# ============================================================================ + +# Model type to MLP class mapping +_MODEL_TYPE_TO_MLP_CLASS = { + "llama": ("transformers.models.llama.modeling_llama", "LlamaMLP"), + "qwen2": ("transformers.models.qwen2.modeling_qwen2", "Qwen2MLP"), + "qwen2_5": ("transformers.models.qwen2.modeling_qwen2", "Qwen2MLP"), # Qwen2.5 uses Qwen2 MLP + "qwen3": ("transformers.models.qwen3.modeling_qwen3", "Qwen3MLP"), + "qwen3_moe": ("transformers.models.qwen3_moe.modeling_qwen3_moe", "Qwen3MoeMLP"), +} + + +def apply_tiled_mlp_monkey_patch( + num_shards: int = 4, + model_type: Optional[str] = None, +): + """Apply TiledMLP monkey patch based on model_type. + + This function MUST be called BEFORE model instantiation to take effect. + It patches the MLP classes in transformers library to use TiledMLP for + memory-efficient computation during training. + + Args: + num_shards: Number of shards to split the input into. Higher values + reduce peak memory but may slightly impact performance. + model_type: The model type string (e.g., "llama", "qwen2", "qwen3"). + If None, patches all supported model types. + + Returns: + List of patched class names. + """ + if model_type is None: + types_to_patch = list(_MODEL_TYPE_TO_MLP_CLASS.keys()) + elif model_type in _MODEL_TYPE_TO_MLP_CLASS: + types_to_patch = [model_type] + else: + raise ValueError( + f"TiledMLP does not support model_type='{model_type}'. " + f"Supported types: {list(_MODEL_TYPE_TO_MLP_CLASS.keys())}. 
" + f"For SwiGLU-style MLPs, you can add support by extending _MODEL_TYPE_TO_MLP_CLASS " + f"in verl/models/transformers/tiled_mlp.py" + ) + + patched_classes = [] + + for mtype in types_to_patch: + module_path, class_name = _MODEL_TYPE_TO_MLP_CLASS[mtype] + try: + import importlib + + module = importlib.import_module(module_path) + mlp_class = getattr(module, class_name) + _patch_mlp_class(mlp_class, _mlp_forward_fn, num_shards) + if class_name not in patched_classes: + patched_classes.append(class_name) + except (ImportError, AttributeError) as e: + print(f"Warning: Could not patch {mtype} MLP: {e}") + + if patched_classes: + print(f"TiledMLP monkey patch applied to: {', '.join(patched_classes)} (shards={num_shards})") + + return patched_classes + + +def _patch_mlp_class(mlp_class: type[nn.Module], forward_fn, num_shards: int): + """Patch a single MLP class to use TiledMLP.""" + + def tiled_forward(self, x): + compute_params = [p for p in self.parameters() if p.requires_grad] + return TiledMLP.apply(forward_fn, self, x, num_shards, compute_params) + + mlp_class.forward = tiled_forward diff --git a/roll/third_party/megatron/model_update.py b/roll/third_party/megatron/model_update.py new file mode 100644 index 000000000..970ae888d --- /dev/null +++ b/roll/third_party/megatron/model_update.py @@ -0,0 +1,483 @@ +import time +from dataclasses import asdict +from typing import Optional + +import ray +import torch +import torch.distributed as dist +from megatron.core import mpu +from transformers.utils import is_peft_available + +from mcore_adapter.models.converter.model_converter import ModelConverter +from mcore_adapter.models.model_factory import McaGPTModel +from roll.configs.base_config import PPOConfig +from roll.configs.worker_config import WorkerConfig, is_actor_infer_overlapping_with_any_cluster +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.driver_utils import Locker +from roll.platforms import current_platform +from roll.utils.collective import collective +from roll.utils.constants import RAY_NAMESPACE +from roll.utils.logging import get_logger +from roll.utils.network_utils import collect_free_port, get_node_ip +from roll.utils.send_recv_utils import serialize_named_weights + + +if is_peft_available(): + from peft import PeftModel, get_peft_model_state_dict + +logger = get_logger() + + +def gather_and_convert_weights( + weights_info: list[tuple[str, torch.Tensor]], + model_converter: ModelConverter, + tp_group: Optional[dist.ProcessGroup] = None, + ep_group: Optional[dist.ProcessGroup] = None, + **kwargs, +) -> dict[str, torch.Tensor]: + """ + weights_info: list of tuples, each tuple is (mcore_name, weight) + """ + if model_converter.mca_config.hf_model_type == "qwen3_vl_moe" and ep_group is not None: + # qwen3_vl_moe has fused moe weights, so we need to gather weights in ep_group before convert + handles, gathered_named_weights = [], [] + group_size = dist.get_world_size(ep_group) + for mcore_name, weight in weights_info: + if group_size == 1: + gathered_named_weights.append((mcore_name, [weight])) + handles.append(None) + continue + gathered_weights = [torch.empty_like(weight) for _ in range(group_size)] + gathered_named_weights.append((mcore_name, gathered_weights)) + handles.append(dist.all_gather(gathered_weights, weight, group=ep_group, async_op=True)) + + def extract_suffix_number(s): + import re + + match = re.search(r"\d+$", s) + return match.group() if match else None + + hf_named_weights = [] + for handle, (mcore_name, weights) in 
zip(handles, gathered_named_weights): + if handle is not None: + handle.wait() + local_moe_index = extract_suffix_number(mcore_name) + for ep_rank, weight in enumerate(weights): + global_moe_index = model_converter.dist_converter.num_layers_for_expert * ep_rank + int( + local_moe_index + ) + name = mcore_name[: -len(local_moe_index)] + str(global_moe_index) + converted_weights = ( + model_converter.convert_to_hf( + {name: [weight]}, layer_index_preprocessed=True, moe_index_preprocessed=True, **kwargs + ) + or {} + ) + hf_named_weights.extend([(name, weight) for name, weight in converted_weights.items()]) + + return hf_named_weights + + handles, gathered_named_weights = [], [] + group_size = 1 if tp_group is None else dist.get_world_size(tp_group) + for mcore_name, weight in weights_info: + if group_size == 1: + gathered_named_weights.append((mcore_name, [weight])) + handles.append(None) + continue + gathered_weights = [torch.empty_like(weight) for _ in range(group_size)] + gathered_named_weights.append((mcore_name, gathered_weights)) + handles.append(dist.all_gather(gathered_weights, weight, group=tp_group, async_op=True)) + + hf_named_weights = [] + for handle, (mcore_name, weights) in zip(handles, gathered_named_weights): + if handle is not None: + handle.wait() + converted_weights = ( + model_converter.convert_to_hf({mcore_name: weights}, layer_index_preprocessed=True, **kwargs) or {} + ) + hf_named_weights.extend([(name, weight) for name, weight in converted_weights.items()]) + + if ep_group is None or dist.get_world_size(ep_group) == 1: + return hf_named_weights + + names = [name for name, _ in hf_named_weights] + # TODO: use cpu but not communicate + ep_group_size = dist.get_world_size(ep_group) + all_names = [None for _ in range(dist.get_world_size(ep_group))] + dist.all_gather_object(all_names, names, group=ep_group) + handles = [] + all_named_weights = [] + for i, (name, weight) in enumerate(hf_named_weights): + gathered_weights = [torch.empty_like(weight) for _ in range(ep_group_size)] + handles.append(dist.all_gather(gathered_weights, weight.contiguous(), group=ep_group, async_op=True)) + for rank, gathered_weight in enumerate(gathered_weights): + ep_name = all_names[rank][i] + all_named_weights.append((ep_name, gathered_weight)) + + for handle in handles: + handle.wait() + return all_named_weights + + +def _gather_hf_weights( + model_converter: ModelConverter, + named_weights: list[tuple[str, torch.Tensor]], + buffer_size: Optional[int] = None, + **kwargs, +): + mca_config = model_converter.mca_config + other_weights_with_info = [] + expert_weights_with_info = [] + for mcore_name, weight in named_weights: + if model_converter.dist_converter.is_expert_parallel_weight(mcore_name): + expert_weights_with_info.append((mcore_name, weight)) + else: + other_weights_with_info.append((mcore_name, weight)) + + def _process_and_yield_weights(weights_info, group=None, ep_group=None): + # TODO: skip tp dup weights gather + waiting_weights, waiting_weights_size = [], 0 + group_size = 1 if group is None else dist.get_world_size(group) + group_size *= 1 if ep_group is None else dist.get_world_size(ep_group) + for mcore_name, weight in weights_info: + weight_size = weight.numel() * weight.element_size() * group_size + if buffer_size is not None and waiting_weights_size + weight_size > buffer_size: + yield gather_and_convert_weights(waiting_weights, model_converter, group, ep_group) + waiting_weights, waiting_weights_size = [], 0 + waiting_weights.append((mcore_name, weight)) + 
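+            # weight_size already includes the TP/EP group factor, so the buffer limit bounds post-gather memory.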
waiting_weights_size += weight_size + + if waiting_weights: + yield gather_and_convert_weights(waiting_weights, model_converter, group, ep_group, **kwargs) + + ep_group = None + if mca_config.expert_model_parallel_size is not None and mca_config.expert_model_parallel_size > 1: + ep_group = mpu.get_expert_model_parallel_group() + + yield from _process_and_yield_weights(expert_weights_with_info, mpu.get_expert_tensor_parallel_group(), ep_group) + yield from _process_and_yield_weights(other_weights_with_info, mpu.get_tensor_model_parallel_group()) + + +def _iter_vp_stage_named_weights(models: list[McaGPTModel], model_converter: ModelConverter): + for vp_stage, model in enumerate(models): + if is_peft_available() and isinstance(model, PeftModel): + mcore_state_dict = get_peft_model_state_dict(model, model.state_dict_for_save_checkpoint()) + else: + mcore_state_dict = model.state_dict_for_save_checkpoint() + for mcore_name, weight in sorted(mcore_state_dict.items()): + if mcore_name.endswith("_extra_state"): + continue + mcore_name = model_converter.dist_converter.preprocess_layer_index(mcore_name, vp_stage=vp_stage) + yield mcore_name, weight + + +def gather_pp_stage_hf_weights(models: list[McaGPTModel], buffer_size, **kwargs): + # gather tp&ep weights, not including pipeline parallel + if not mpu.model_parallel_is_initialized(): + raise RuntimeError("Model parallelism must be initialized before save as hf inflight.") + + model_config = models[0].config + model_converter = ModelConverter(model_config, to_hf=True, efficient_mode=True) + yield from _gather_hf_weights( + model_converter, list(_iter_vp_stage_named_weights(models, model_converter)), buffer_size, **kwargs + ) + + +def gather_weights_meta_cross_pp(models: list[McaGPTModel]): + if not mpu.model_parallel_is_initialized(): + raise RuntimeError("Model parallelism must be initialized before save as hf inflight.") + model_config = models[0].config + if model_config.pipeline_model_parallel_size <= 1: + return None + pp_rank = mpu.get_pipeline_model_parallel_rank() + model_converter = ModelConverter(model_config, to_hf=True, efficient_mode=True) + named_weights_meta = [] + for mcore_name, weight in _iter_vp_stage_named_weights(models, model_converter): + weight_size = weight.numel() * weight.element_size() + if model_converter.dist_converter.is_expert_parallel_weight(mcore_name): + weight_size *= model_config.expert_model_parallel_size * model_config.expert_tensor_parallel_size + else: + weight_size *= model_config.tensor_model_parallel_size + named_weights_meta.append( + { + "name": mcore_name, + "shape": weight.shape, + "dtype": weight.dtype, + "pp_stage": pp_rank, + "size": weight_size, + } + ) + all_named_weights_meta = [None for _ in range(model_config.pipeline_model_parallel_size)] + dist.all_gather_object(all_named_weights_meta, named_weights_meta, group=mpu.get_pipeline_model_parallel_group()) + all_named_weights_meta = sorted( + [meta for metas in all_named_weights_meta for meta in metas], key=lambda x: x["name"] + ) + expert_weights_meta = [] + other_weights_meta = [] + for meta in all_named_weights_meta: + if model_converter.dist_converter.is_expert_parallel_weight(meta["name"]): + expert_weights_meta.append(meta) + else: + other_weights_meta.append(meta) + return expert_weights_meta + other_weights_meta + + +def gather_all_hf_weights(models: list[McaGPTModel], buffer_size: int, weights_meta: Optional[list[dict]]): + # weights_meta: list of dict, each dict is {"name": str, "shape": list, "dtype": str, "pp_stage": int, "size": int} 
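+    # weights_meta is gathered across PP ranks up front, so every rank knows which PP stage owns each
+    # weight and can allocate an empty buffer to receive it over the PP broadcast below.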
+ if not mpu.model_parallel_is_initialized(): + raise RuntimeError("Model parallelism must be initialized before save as hf inflight.") + + kwargs = {} + if is_peft_available() and isinstance(models[0], PeftModel): + lora_rank = next(iter(models[0].peft_config.values())).r + kwargs = {"lora_rank": lora_rank} + + pp_size = models[0].config.pipeline_model_parallel_size + if pp_size <= 1: + yield from gather_pp_stage_hf_weights(models, buffer_size, **kwargs) + return + + pp_rank = mpu.get_pipeline_model_parallel_rank() + model_converter = ModelConverter( + models[0].config, pipeline_model_parallel_rank=pp_rank, to_hf=True, efficient_mode=True + ) + cur_stage_state_dict = { + mcore_name: weight for mcore_name, weight in _iter_vp_stage_named_weights(models, model_converter) + } + + def _gather_batch_params(named_weights_with_stage: list[tuple[str, torch.Tensor, int]]): + # named_weights_with_stage: list of tuples, each tuple is (mcore_name, weight, pp_stage) + named_weights, handles = [], [] + for mcore_name, weight, pp_stage in named_weights_with_stage: + named_weights.append((mcore_name, weight)) + handles.append( + dist.broadcast( + weight, group=mpu.get_pipeline_model_parallel_group(), async_op=True, group_src=pp_stage + ) + ) + for handle in handles: + handle.wait() + yield from _gather_hf_weights(model_converter, named_weights, **kwargs) + + waiting_weights, waiting_weights_size = [], 0 + for weight_meta in weights_meta: + weight_size = weight_meta["size"] + if waiting_weights_size + weight_size > buffer_size and waiting_weights: + yield from _gather_batch_params(waiting_weights) + waiting_weights, waiting_weights_size = [], 0 + if weight_meta["pp_stage"] == pp_rank: + weight = cur_stage_state_dict[weight_meta["name"]] + else: + weight = torch.empty(weight_meta["shape"], dtype=weight_meta["dtype"], device=current_platform.device_type) + waiting_weights.append((weight_meta["name"], weight, weight_meta["pp_stage"])) + waiting_weights_size += weight_size + if waiting_weights: + yield from _gather_batch_params(waiting_weights) + + +class MegatronWeightUpdater: + def __init__( + self, + pipeline_config: PPOConfig, + worker_config: WorkerConfig, + model_update_name: str, + models_unwrapped, + infer_cluster: Cluster, + ): + self.pipeline_config = pipeline_config + self.worker_config = worker_config + self.model_update_name = model_update_name + self.models_unwrapped = models_unwrapped + self.model_update_infer_workers = infer_cluster.workers + self._model_update_buffer_size = ( + pipeline_config.model_update_buffer_size_mb * 1024 * 1024 + ) # Convert MB to bytes + self.infer_worker_config = infer_cluster.worker_config + self.infer_cluster = infer_cluster + self.is_colocated = is_actor_infer_overlapping_with_any_cluster( + infer_cluster.worker_config, actor_train=worker_config + ) + self._broadcast_workers = None + + # Colocated mode attributes + self._infer_parallel_cpu_group = None + self._co_infer_worker = None + self._buffer_num = None + + # Separated mode attributes + self.model_update_group_name = None + self._model_update_locker = None + self._weights_meta = None + + if self.is_colocated: + self._setup_colocated_model_update() + else: + self._setup_separated_model_update() + + def model_update(self): + if self.is_colocated: + return self._colocated_model_update() + return self._separated_model_update() + + def _setup_colocated_model_update(self): + logger.info(f"RANK {dist.get_rank()} Setup colocated model update") + infer_worker_devices_num = self.infer_worker_config.num_gpus_per_worker + 
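+        # Each consecutive group of infer_worker_devices_num train ranks shares one colocated infer
+        # worker; the group leader later pushes the gathered weights to it via update_parameter_in_bucket.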
train_world_size = dist.get_world_size() + + device_start_diff = min(self.worker_config.device_mapping) - min(self.infer_worker_config.device_mapping) + device_end_diff = max(self.worker_config.device_mapping) - max(self.infer_worker_config.device_mapping) + + assert device_start_diff % infer_worker_devices_num == 0 + assert device_end_diff % infer_worker_devices_num == 0 + + for start_rank in range(0, train_world_size, infer_worker_devices_num): + end_rank = start_rank + infer_worker_devices_num + assert end_rank <= train_world_size + group_ranks = list(range(start_rank, end_rank)) + new_group = dist.new_group(ranks=group_ranks, backend="gloo") + if dist.get_rank() in group_ranks: + self._infer_parallel_cpu_group = new_group + infer_worker_idx = (dist.get_rank() + device_start_diff) // infer_worker_devices_num + self._co_infer_worker = None + if 0 <= infer_worker_idx < len(self.model_update_infer_workers): + self._co_infer_worker = self.model_update_infer_workers[infer_worker_idx] + + # rank0 broadcast to mismatch workers + if dist.get_rank() == 0 and (device_start_diff > 0 or device_end_diff < 0): + self._broadcast_workers = [] + if device_start_diff > 0: + self._broadcast_workers.extend( + self.model_update_infer_workers[: device_start_diff // infer_worker_devices_num] + ) + if device_end_diff < 0: + self._broadcast_workers.extend( + self.model_update_infer_workers[device_end_diff // infer_worker_devices_num :] + ) + self._setup_broadcast_group() + + self._weights_meta = gather_weights_meta_cross_pp(self.models_unwrapped) + + def _setup_separated_model_update(self): + self._model_update_locker = Locker.options( + name="model_update_locker", get_if_exists=True, namespace=RAY_NAMESPACE + ).remote() + if not ( + mpu.get_data_parallel_rank(with_context_parallel=True) == 0 and mpu.get_tensor_model_parallel_rank() == 0 + ): + return + + self._broadcast_workers = self.model_update_infer_workers + self._setup_broadcast_group() + + def _setup_broadcast_group(self): + if not self._broadcast_workers: + return + + ep_rank = 0 + if ( + self.models_unwrapped[0].config.num_moe_experts is not None + and self.models_unwrapped[0].config.num_moe_experts > 1 + ): + ep_rank = mpu.get_expert_model_parallel_rank() + model_update_group_name = f"{self.model_update_name}_pp{mpu.get_pipeline_model_parallel_rank()}_ep{ep_rank}" + self.model_update_group_name = model_update_group_name + + num_gpus_per_infer_worker = self.infer_worker_config.num_gpus_per_worker + infer_device_num = num_gpus_per_infer_worker * len(self._broadcast_workers) + master_address, master_port = get_node_ip(), collect_free_port() + + refs = [ + infer_worker.setup_collective_group.remote( + master_address=master_address, + master_port=master_port, + group_name=self.model_update_group_name, + rank_offset=i * num_gpus_per_infer_worker + 1, + world_size=infer_device_num + 1, + ) + for i, infer_worker in enumerate(self._broadcast_workers) + ] + collective.init_collective_group( + infer_device_num + 1, + 0, + group_name=self.model_update_group_name, + master_addr=master_address, + master_port=master_port, + ) + ray.get(refs) + + logger.info(f"Init weights update group {model_update_group_name}") + + def _broadcast_to_infer_workers(self, hf_named_weights) -> list[ray.ObjectRef]: + if not self._broadcast_workers: + return [] + refs = [ + worker.broadcast_parameter.remote( + group_name=self.model_update_group_name, + names=[n for n, _ in hf_named_weights], + dtypes=[w.dtype for _, w in hf_named_weights], + shapes=[w.shape for _, w in 
hf_named_weights], + is_lora=self.worker_config.model_args.lora_target is not None, + ) + for worker in self._broadcast_workers + ] + handles = [] + for _, weight in hf_named_weights: + handles.append( + collective.broadcast(tensor=weight, src_rank=0, group_name=self.model_update_group_name, async_op=True) + ) + for handle in handles: + handle.wait() + return refs + + def _colocated_model_update(self): + refs = [] + infer_parallel_size = dist.get_world_size(self._infer_parallel_cpu_group) + co_infer_rank = dist.get_rank(self._infer_parallel_cpu_group) + if is_lora := (self.worker_config.model_args.lora_target is not None): + peft_config = self.models_unwrapped[0].peft_config.get("default", None) + for hf_named_weights in gather_all_hf_weights( + self.models_unwrapped, buffer_size=self._model_update_buffer_size, weights_meta=self._weights_meta + ): + if self._co_infer_worker is not None: + serialized_tensors = serialize_named_weights( + hf_named_weights, infer_strategy=self.infer_worker_config.strategy_args.strategy_name + ) + infer_parallel_tensors = [None] * infer_parallel_size if co_infer_rank == 0 else None + dist.gather_object( + serialized_tensors, infer_parallel_tensors, group_dst=0, group=self._infer_parallel_cpu_group + ) + + if refs: + ray.get(refs) + refs = [] + if co_infer_rank == 0 and self._co_infer_worker is not None: + refs.append( + self._co_infer_worker.update_parameter_in_bucket.remote(infer_parallel_tensors, is_lora=is_lora) + ) + if self._broadcast_workers: + refs.extend(self._broadcast_to_infer_workers(hf_named_weights)) + + if refs: + ray.get(refs) + refs = [] + + if is_lora and co_infer_rank == 0 and self._co_infer_worker is not None: + refs.append(self._co_infer_worker.add_lora.remote(peft_config=asdict(peft_config))) + return {} + + def _separated_model_update(self): + if not mpu.get_expert_data_parallel_rank() == 0: + return {} + + logger.info(f"start broadcast model update {self.model_update_name}") + for hf_named_weights in gather_pp_stage_hf_weights( + self.models_unwrapped, buffer_size=self._model_update_buffer_size + ): + if not self._broadcast_workers: + continue + while not ray.get(self._model_update_locker.acquire.remote()): + time.sleep(0.1) + refs = self._broadcast_to_infer_workers(hf_named_weights) + ray.get(refs) + ray.get(self._model_update_locker.release.remote()) + return {} diff --git a/roll/third_party/megatron/tensor_parallel.py b/roll/third_party/megatron/tensor_parallel.py index 80bddeb13..69648296e 100644 --- a/roll/third_party/megatron/tensor_parallel.py +++ b/roll/third_party/megatron/tensor_parallel.py @@ -6,7 +6,6 @@ class _VocabParallelEntropy(torch.autograd.Function): @staticmethod def forward(ctx, vocab_parallel_logits: torch.Tensor) -> torch.Tensor: - vocab_parallel_logits = vocab_parallel_logits.float() @torch.compile(dynamic=True) def mul_reduce(a, b): return (a * b).sum(dim=-1, keepdim=True) diff --git a/roll/third_party/sglang/__init__.py b/roll/third_party/sglang/__init__.py index e3d796903..768eb744e 100644 --- a/roll/third_party/sglang/__init__.py +++ b/roll/third_party/sglang/__init__.py @@ -16,5 +16,8 @@ elif sgl.__version__ == '0.5.4.post2': from roll.third_party.sglang import v054_patch patch = v054_patch +elif sgl.__version__ == '0.5.5.post3': + from roll.third_party.sglang import v054_patch + patch = v054_patch else: raise NotImplementedError(f"Scale aligner version sglang:{sgl.__version__} is not supported.") \ No newline at end of file diff --git a/roll/third_party/sglang/async_engine.py 
b/roll/third_party/sglang/async_engine.py deleted file mode 100644 index d1aa52b45..000000000 --- a/roll/third_party/sglang/async_engine.py +++ /dev/null @@ -1,205 +0,0 @@ -import asyncio -import contextlib -import dataclasses -import enum -import traceback - -from roll.utils.logging import get_logger - - -logger = get_logger() - - -class SglangInputType(enum.Enum): - ADD = enum.auto() - ABORT = enum.auto() - - -# 用于存放所有abort_rid_set -abort_rid_set = set() -abort_lock = asyncio.Lock() -stop_flag = False - - -async def producer(thread_queue, asyncio_queue): - PRODUCER_PUT_TIMEOUT = 15 * 60 - global stop_flag - stop_flag = False - while True: - if not thread_queue.empty(): - data = thread_queue.get() - # 收到结束标记 - if data is None: - stop_flag = True - logger.info("[sglang async engine] receive stop signal, stoping") - break - command, command_data = data - if command == SglangInputType.ABORT: - async with abort_lock: - rid = command_data - abort_rid_set.add(rid) - else: - await asyncio.wait_for(asyncio_queue.put(data), timeout=PRODUCER_PUT_TIMEOUT) - else: - await asyncio.sleep(0.1) - -async def consumer(asyncio_queue, consumer_id, llm, request_complete_callback): - from sglang.srt.managers.io_struct import GenerateReqInput - - from roll.distributed.scheduler.protocol import DataProto - - def process_sglang_output(chunks, meta_info): - output_data = DataProto(meta_info=meta_info) - if chunks is None or chunks[0] is None: - # report a abort request - output_data.meta_info["finish_reasons"] = [None] # not finished - request_complete_callback(data=output_data) - return - - output_token_ids = [chunk.get("output_ids", []) for chunk in chunks] - output_logprobs = [chunk["meta_info"].get("output_token_logprobs", None) for chunk in chunks] - has_logprobs = any(logprobs is not None for logprobs in output_logprobs) - if has_logprobs: - lens = [min(len(ids), len(logprobs)) for ids, logprobs in zip(output_token_ids, output_logprobs)] - output_token_ids = [ids[:l] for ids, l in zip(output_token_ids, lens)] - output_logprobs = [logprobs[:l] for logprobs, l in zip(output_logprobs, lens)] - output_logprobs = [[prob_info[0] for prob_info in logprobs] for logprobs in output_logprobs] - output_data.meta_info["output_logprobs"] = output_logprobs - assert all([len(ids) == len(logprobs) for ids, logprobs in zip(output_token_ids, output_logprobs)]), ( - "output_token_ids and output_logprobs length not match" - ) - output_data.meta_info["output_token_ids"] = output_token_ids - output_data.meta_info["finish_reasons"] = [chunk["meta_info"].get("finish_reason") for chunk in chunks] - request_complete_callback(data=output_data) - logger.debug(f"worker_id:{consumer_id} request_id: {meta_info['request_id']} finish!") - - try: - while True: - pack_data = await asyncio_queue.get() - asyncio_queue.task_done() - if pack_data is None: - break - - command, data = pack_data - - rid, input_ids, sampling_params, meta_info = data - collect_unfinished = meta_info.get("collect_unfinished", False) - rid_str = rid[0] - - final_chunks: list[dict] = [None for _ in range(sampling_params['n'])] - logger.debug(f"worker_id:{consumer_id} request_id: {rid} starting!") - - if sampling_params['n'] > 1: - rid = [rid] - assert not collect_unfinished, "collect_unfinished is not supported in parallel sampling" - - obj_init_kw = {} # return logprobs may be in GenerateReqInput not SamplingParams - for field in dataclasses.fields(GenerateReqInput): - if field.name in sampling_params: - obj_init_kw[field.name] = sampling_params.pop(field.name) - from 
sglang import __version__ as version - if version >= '0.4.6.post4': - sampling_params['stream_interval'] = 50 - obj = GenerateReqInput( - input_ids=input_ids, - sampling_params=sampling_params, - stream=True, - **obj_init_kw, - ) - - need_abort = stop_flag - async with abort_lock: - if rid_str in abort_rid_set: - need_abort = True - logger.debug(f"request_id: {rid_str} do not running!") - if need_abort: - if collect_unfinished: - process_sglang_output(None, meta_info) - continue - - generator = llm.tokenizer_manager.generate_request(obj, None) - generate_success = True - next_task = asyncio.create_task(generator.__anext__()) - while True: - is_timeout = False - try: - chunk = await asyncio.wait_for(asyncio.shield(next_task), timeout=10) - next_task = asyncio.create_task(generator.__anext__()) - except asyncio.TimeoutError: - is_timeout = True - except StopAsyncIteration: - break - if not is_timeout: - chunk_index = chunk.get("index", 0) - final_chunks[chunk_index] = chunk - - need_abort = stop_flag - async with abort_lock: - if rid_str in abort_rid_set: - need_abort = True - - if need_abort: - logger.debug(f"request_id: {rid_str} aborting!") - if obj.is_single: - llm.tokenizer_manager.abort_request(obj.rid) - else: - for rid in obj.rid: - llm.tokenizer_manager.abort_request(rid) - logger.debug(f"request_id: {rid_str} abort success!") - generate_success = False - next_task.cancel() - with contextlib.suppress(asyncio.CancelledError): - await next_task - break - - if generate_success or collect_unfinished: - process_sglang_output(final_chunks, meta_info) - except Exception as e: - logger.info(traceback.format_exc()) - -async def predict_in_asyncio(model, request_complete_callback, thread_queue, max_running_requests=128): - PRODUCER_BUFFER_SIZE = 128 - - logger.info("[sglang asyncio] env setup...") - async with abort_lock: - abort_rid_set.clear() - asyncio_queue = asyncio.Queue(maxsize=PRODUCER_BUFFER_SIZE) - producer_task = asyncio.create_task(producer(thread_queue, asyncio_queue)) - consumers = [ - asyncio.create_task(consumer(asyncio_queue, i, model, request_complete_callback)) - for i in range(max_running_requests) - ] - logger.info("[sglang asyncio] env setup (done)") - - await producer_task - logger.info("[sglang asyncio] killing consumers ...") - for _ in range(len(consumers)): - await asyncio_queue.put(None) - - await asyncio_queue.join() - logger.info("[sglang asyncio] finish signal has set") - - try: - await asyncio.wait_for(asyncio.gather(*consumers), timeout=60) - except asyncio.TimeoutError: - logger.info("Timeout: Not all tasks completed within the time limit") - # for safety, all requests should already be aborted - for rid in model.tokenizer_manager.rid_to_state: - model.tokenizer_manager.abort_request(rid) - logger.info("killing workers done, AsyncSglangEngine stop success") - -def start_async_sglang(loop, model, request_complete_callback, thread_queue, max_running_requests=128): - try: - loop.run_until_complete( - predict_in_asyncio( - model, request_complete_callback, thread_queue=thread_queue, max_running_requests=max_running_requests - ) - ) - except Exception as e: - logger.info(f"async_sglang thread raise Exception!\n{traceback.format_exc()}") - -def add_request(thread_queue, data): - thread_queue.put((SglangInputType.ADD, data)) - -def abort_request(thread_queue, rid): - thread_queue.put((SglangInputType.ABORT, rid)) diff --git a/roll/third_party/sglang/fp8.py b/roll/third_party/sglang/fp8.py new file mode 100644 index 000000000..6c99adfc8 --- /dev/null +++ 
b/roll/third_party/sglang/fp8.py @@ -0,0 +1,304 @@ +from typing import Any, Dict, List +from functools import partial +import weakref + +import torch +from torch.nn import Module +from torch.nn.parameter import Parameter + +from sglang.srt.layers.quantization.fp8 import ( + Fp8Config, + _is_fp8_fnuz, + _is_cpu, + _is_hip, + _use_hip_int4, + _use_aiter, +) +from sglang.srt.layers.parameter import ( + BlockQuantScaleParameter, + ModelWeightParameter, +) +from sglang.srt.layers.moe import get_moe_runner_backend +from sglang.srt.layers.moe.ep_moe.layer import DeepEPMoE + +from roll.utils.fp8 import per_block_fp8_quant +from roll.utils.logging import get_logger + +logger = get_logger() + +def from_config(cls, config: Dict[str, Any]) -> Fp8Config: + quant_method = cls.get_from_keys_or(config, ["quant_method"], "") + is_checkpoint_fp8_serialized = "fp8" in quant_method + activation_scheme = cls.get_from_keys(config, ["activation_scheme"]) + ignored_layers = cls.get_from_keys_or(config, ["ignored_layers"], None) + weight_block_size = cls.get_from_keys_or(config, ["weight_block_size"], None) + skip_process_weights_after_loading = not is_checkpoint_fp8_serialized + config = cls( + is_checkpoint_fp8_serialized=True, + activation_scheme=activation_scheme, + ignored_layers=ignored_layers, + weight_block_size=weight_block_size, + ) + config.skip_process_weights_after_loading = skip_process_weights_after_loading + return config + +def monkey_patch_fp8_config(): + Fp8Config.from_config = classmethod(from_config) + +def per_block_fp8_quant_ue8m0( + weight: torch.Tensor, + weight_block_size: List[int], +): + from sglang.srt.layers.quantization.fp8_utils import ( + quant_weight_ue8m0, + transform_scale_ue8m0, + ) + assert weight_block_size == [128, 128] + + out_w, out_s = quant_weight_ue8m0( + weight_dequant=weight, + weight_block_size=weight_block_size, + ) + + out_s = transform_scale_ue8m0(out_s, mn=out_w.shape[-2]) + + return out_w, out_s + +def monkey_patch_fp8_linear_method(): + def f_weight_loader( + layer: weakref.ReferenceType, + original_weight_loader, + param: torch.Tensor, + loaded_weight: torch.Tensor, + *args, + **kwargs + ) -> None: + layer = layer() + assert param is layer.weight + target_device = layer.weight.device + with target_device: + loaded_weight = loaded_weight.to(target_device) + weight = ModelWeightParameter( + data=layer.weight.data if layer.weight_block_size else layer.weight.data.t(), + input_dim=1, + output_dim=0, + weight_loader=original_weight_loader, + ) + if loaded_weight.dtype == torch.float8_e4m3fn: + original_weight_loader(weight, loaded_weight, *args, **kwargs) + else: + if layer.format_ue8m0: + qweight, scale = per_block_fp8_quant_ue8m0(loaded_weight, layer.weight_block_size) + else: + qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) + weight_scale_inv = BlockQuantScaleParameter( + data=layer.weight_scale_inv.data, + input_dim=1, + output_dim=0, + weight_loader=original_weight_loader, + ) + weight_scale_inv.format_ue8m0 = True + original_weight_loader(weight, qweight, *args, **kwargs) + original_weight_loader(weight_scale_inv, scale, *args, **kwargs) + + def f_weight_scale_loader( + layer: weakref.ReferenceType, + original_weight_loader, + param: torch.Tensor, + loaded_weight: torch.Tensor, + *args, + **kwargs + ) -> None: + layer = layer() + assert param is layer.weight_scale_inv + target_device = layer.weight_scale_inv.device + with target_device: + weight_scale_inv = BlockQuantScaleParameter( + data=layer.weight_scale_inv.data, + 
input_dim=1, + output_dim=0, + weight_loader=original_weight_loader, + ) + original_weight_loader(weight_scale_inv, loaded_weight, *args, **kwargs) + + from sglang.srt.layers.quantization.fp8 import Fp8LinearMethod + original_create_weights = Fp8LinearMethod.create_weights + original_process_weights_after_loading = Fp8LinearMethod.process_weights_after_loading + + def f_create_weights( + self, + layer: torch.nn.Module, + input_size_per_partition: int, + output_partition_sizes: List[int], + input_size: int, + output_size: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + original_create_weights(self, layer, input_size_per_partition, output_partition_sizes, input_size, output_size, params_dtype, **extra_weight_attrs) + assert self.quant_config.is_checkpoint_fp8_serialized + assert self.block_quant, "only support block-wise quantization" + assert self.quant_config.weight_block_size + assert self.quant_config.activation_scheme == "dynamic" + assert not _is_fp8_fnuz + assert not _is_cpu + assert layer.input_scale is None + + if self.quant_config.skip_process_weights_after_loading: + try: + from sglang.srt.layers.quantization.fp8_utils import ( + requant_weight_ue8m0_inplace, + deepgemm_w8a8_block_fp8_linear_with_fallback, + ) + from sglang.srt.model_loader.utils import should_deepgemm_weight_requant_ue8m0 + # For fp8 linear weights run with deepgemm, the weights and scales need to be requantized to ue8m0 + if ( + should_deepgemm_weight_requant_ue8m0(self.quant_config.weight_block_size) + and self.w8a8_block_fp8_linear is deepgemm_w8a8_block_fp8_linear_with_fallback + ): + requant_weight_ue8m0_inplace(layer.weight, layer.weight_scale_inv, self.quant_config.weight_block_size) + layer.format_ue8m0 = True + else: + layer.format_ue8m0 = False + except Exception: + layer.format_ue8m0 = False + + layer.weight_block_size = self.quant_config.weight_block_size + + weight_loader = layer.weight.weight_loader + weight_loader = partial(f_weight_loader, weakref.ref(layer), weight_loader) + layer.weight = Parameter(layer.weight.data, requires_grad=False) + layer.weight.weight_loader = weight_loader + + weight_scale_inv_loader = layer.weight_scale_inv.weight_loader + weight_scale_inv_loader = partial(f_weight_scale_loader, weakref.ref(layer), weight_scale_inv_loader) + weight_scale_inv = layer.weight_scale_inv + layer.weight_scale_inv = Parameter(weight_scale_inv.data, requires_grad=False) + layer.weight_scale_inv.format_ue8m0 = self.quant_config.skip_process_weights_after_loading and layer.format_ue8m0 + layer.weight_scale_inv.weight_loader = weight_scale_inv_loader + + def f_process_weights_after_loading(self, layer: Module) -> None: + if not self.quant_config.skip_process_weights_after_loading: + original_process_weights_after_loading(self, layer) + + Fp8LinearMethod.create_weights = f_create_weights + Fp8LinearMethod.process_weights_after_loading = f_process_weights_after_loading + +def monkey_patch_fp8_moe_method(): + def f_w13_weight_loader( + layer: weakref.ReferenceType, + original_weight_loader, + param: torch.Tensor, + loaded_weight: torch.Tensor, + *args, + **kwargs + ) -> None: + layer = layer() + assert param is layer.w13_weight + target_device = layer.w13_weight.device + with target_device: + loaded_weight = loaded_weight.to(target_device) + if loaded_weight.dtype == torch.float8_e4m3fn: + original_weight_loader(layer.w13_weight, loaded_weight, *args, **kwargs) + else: + if layer.format_ue8m0: + qweight, scale = per_block_fp8_quant_ue8m0(loaded_weight, layer.weight_block_size) + else: + qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) + original_weight_loader(layer.w13_weight, qweight, *args, **kwargs) + original_weight_loader(layer.w13_weight_scale_inv, scale, *args, **kwargs) + + def f_w2_weight_loader( + layer: weakref.ReferenceType, + original_weight_loader, + param: torch.Tensor, + loaded_weight: torch.Tensor, + *args, + **kwargs + ) -> None: + layer = layer() + assert param is layer.w2_weight + target_device = layer.w2_weight.device + with target_device: + loaded_weight = loaded_weight.to(target_device) + if loaded_weight.dtype == torch.float8_e4m3fn: + original_weight_loader(layer.w2_weight, loaded_weight, *args, **kwargs) + else: + if layer.format_ue8m0: + qweight, scale = per_block_fp8_quant_ue8m0(loaded_weight, layer.weight_block_size) + else: + qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) + original_weight_loader(layer.w2_weight, qweight, *args, **kwargs) + original_weight_loader(layer.w2_weight_scale_inv, scale, *args, **kwargs) + + from sglang.srt.layers.quantization.fp8 import Fp8MoEMethod + original_create_weights = Fp8MoEMethod.create_weights + original_process_weights_after_loading = Fp8MoEMethod.process_weights_after_loading + + def f_create_weights( + self, + layer: Module, + num_experts: int, + hidden_size: int, + intermediate_size_per_partition: int, + params_dtype: torch.dtype, + **extra_weight_attrs, + ): + original_create_weights(self, layer, num_experts, hidden_size, intermediate_size_per_partition, params_dtype, **extra_weight_attrs) + assert self.quant_config.is_checkpoint_fp8_serialized + assert self.block_quant, "only support block-wise quantization" + assert self.quant_config.weight_block_size + assert self.quant_config.activation_scheme == "dynamic" + assert not _is_fp8_fnuz + assert not _is_cpu + assert not (_is_hip and _use_hip_int4) + assert not _use_aiter + + if self.quant_config.skip_process_weights_after_loading: + try: + from sglang.srt.layers.quantization.fp8_utils import ( + requant_weight_ue8m0_inplace, + ) + from sglang.srt.model_loader.utils import should_deepgemm_weight_requant_ue8m0 + # For fp8 moe run with deepgemm, the expert weights and scales need to be requantized to ue8m0 + if ( + should_deepgemm_weight_requant_ue8m0(self.quant_config.weight_block_size) + and get_moe_runner_backend().is_deep_gemm() + ): + assert isinstance( + layer, DeepEPMoE + ), "DeepGemm MoE is only supported with DeepEPMoE" + requant_weight_ue8m0_inplace(layer.w13_weight, layer.w13_weight_scale_inv, layer.weight_block_size) + requant_weight_ue8m0_inplace(layer.w2_weight, layer.w2_weight_scale_inv, layer.weight_block_size) + layer.format_ue8m0 = True + else: + layer.format_ue8m0 = False + except Exception: + layer.format_ue8m0 = False + + # store essential config in layer for custom weight loader + layer.weight_block_size = self.quant_config.weight_block_size + + w13_weight_loader = layer.w13_weight.weight_loader + w13_weight_loader = partial(f_w13_weight_loader, weakref.ref(layer), w13_weight_loader) + layer.w13_weight.weight_loader = w13_weight_loader + + w2_weight_loader = layer.w2_weight.weight_loader + w2_weight_loader = partial(f_w2_weight_loader, weakref.ref(layer), w2_weight_loader) + layer.w2_weight.weight_loader = w2_weight_loader + + # no need to patch the weight loaders of the scales + assert type(layer.w13_weight_scale_inv) == Parameter + assert type(layer.w2_weight_scale_inv) == Parameter + + def f_process_weights_after_loading(self, layer: Module) -> None: + if not
self.quant_config.skip_process_weights_after_loading: + original_process_weights_after_loading(self, layer) + + Fp8MoEMethod.create_weights = f_create_weights + Fp8MoEMethod.process_weights_after_loading = f_process_weights_after_loading + +def monkey_patch_fp8(): + monkey_patch_fp8_config() + monkey_patch_fp8_linear_method() + monkey_patch_fp8_moe_method() diff --git a/roll/third_party/sglang/io_struct.py b/roll/third_party/sglang/io_struct.py deleted file mode 100644 index faa6d156b..000000000 --- a/roll/third_party/sglang/io_struct.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class SetupCollectiveGroupReqInput: - comm_plan: dict - backend: int - rank_in_cluster: int - - -@dataclass -class SetupCollectiveGroupReqOutput: - success: bool - message: str - -@dataclass -class BroadcastBucketReqInput: - src_pp_rank: str - meta_infos: dict - bucket_size: int - - -@dataclass -class BroadcastBucketReqOutput: - success: bool - message: str - -@dataclass -class BroadcastParameterReqInput: - src_pp_rank: str - dtype: int - shape: dict - parameter_name: str - - -@dataclass -class BroadcastParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterReqInput: - parameter_name: str - weight: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterInBucketReqInput: - meta_infos: str - buffer: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterInBucketReqOutput: - success: bool - message: str \ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/__init__.py b/roll/third_party/sglang/v0410post2_patch/__init__.py index fa4bec152..32de7e606 100644 --- a/roll/third_party/sglang/v0410post2_patch/__init__.py +++ b/roll/third_party/sglang/v0410post2_patch/__init__.py @@ -1,2 +1 @@ from . import engine -from . import scheduler \ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/engine.py b/roll/third_party/sglang/v0410post2_patch/engine.py index cdc241676..c82ad395f 100644 --- a/roll/third_party/sglang/v0410post2_patch/engine.py +++ b/roll/third_party/sglang/v0410post2_patch/engine.py @@ -1,101 +1,35 @@ -import asyncio -from sglang.srt.entrypoints.engine import Engine +import os +import multiprocessing as mp -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, +import sglang.srt.entrypoints.engine as engine_module +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import ( + set_prometheus_multiproc_dir, + set_ulimit, ) -import sglang.srt.entrypoints.engine as engine_module +# Remove signal handler. signal.signal in Python can only run in MainThread, which fails when using Ray Async Actor.
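+# The function below is upstream sglang's _set_envs_and_config with that signal-handler
+# registration dropped. _roll_launch_subprocesses (defined later in this file) injects it into
+# sglang.srt.entrypoints.engine before delegating to the original _launch_subprocesses, so an
+# engine started from a Ray async actor (i.e. outside the main thread) never calls signal.signal().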
+def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) + if not server_args.enable_symm_mem: + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = "AUTO" -class EngineSA(Engine): + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + + # Set ulimit + set_ulimit() + + # Set mp start method + mp.set_start_method("spawn", force=True) - def setup_collective_group( - self, - comm_plan: str, - backend: str, - rank_in_cluster: int, - ): - obj = SetupCollectiveGroupReqInput( - comm_plan=comm_plan, - backend=backend, - rank_in_cluster=rank_in_cluster, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.setup_collective_group(obj, None) - ) - - def broadcast_bucket( - self, - src_pp_rank: int, - meta_infos: dict, - bucket_size: int, - ): - obj = BroadcastBucketReqInput( - src_pp_rank=src_pp_rank, - meta_infos=meta_infos, - bucket_size=bucket_size, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_bucket(obj, None) - ) - - def broadcast_parameter( - self, - src_pp_rank, - dtype, - shape, - parameter_name - ): - obj = BroadcastParameterReqInput( - src_pp_rank=src_pp_rank, - dtype=dtype, - shape=shape, - parameter_name=parameter_name, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_parameter(obj, None) - ) - - def update_parameter( - self, - parameter_name, - weight, - ranks_in_worker - ): - obj = UpdateParameterReqInput( - parameter_name=parameter_name, - weight=weight, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter(obj, None) - ) - - def update_parameter_in_bucket( - self, - meta_infos, - buffer, - ranks_in_worker - ): - """Initialize parameter update group.""" - obj = UpdateParameterInBucketReqInput( - meta_infos=meta_infos, - buffer=buffer, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter_in_bucket(obj, None) - ) class _roll_launch_subprocesses(object): def __init__(self, _launch_subprocesses): @@ -103,11 +37,8 @@ def __init__(self, _launch_subprocesses): def __call__(self, *args, **kwargs): import sys - from roll.third_party.sglang.v0410post2_patch.tokenizer_manager import TokenizerManagerSA - from roll.third_party.sglang.v0410post2_patch.scheduler import run_scheduler_process - - sys.modules['sglang.srt.entrypoints.engine'].__dict__['TokenizerManager'] = TokenizerManagerSA - sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_scheduler_process'] = run_scheduler_process + + sys.modules['sglang.srt.entrypoints.engine'].__dict__['_set_envs_and_config'] = _set_envs_and_config return self._launch_subprocesses(*args, **kwargs) diff --git a/roll/third_party/sglang/v0410post2_patch/io_struct.py b/roll/third_party/sglang/v0410post2_patch/io_struct.py deleted file mode 100644 index faa6d156b..000000000 --- a/roll/third_party/sglang/v0410post2_patch/io_struct.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class SetupCollectiveGroupReqInput: - comm_plan: dict - backend: 
int - rank_in_cluster: int - - -@dataclass -class SetupCollectiveGroupReqOutput: - success: bool - message: str - -@dataclass -class BroadcastBucketReqInput: - src_pp_rank: str - meta_infos: dict - bucket_size: int - - -@dataclass -class BroadcastBucketReqOutput: - success: bool - message: str - -@dataclass -class BroadcastParameterReqInput: - src_pp_rank: str - dtype: int - shape: dict - parameter_name: str - - -@dataclass -class BroadcastParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterReqInput: - parameter_name: str - weight: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterInBucketReqInput: - meta_infos: str - buffer: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterInBucketReqOutput: - success: bool - message: str \ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/model_runner.py b/roll/third_party/sglang/v0410post2_patch/model_runner.py deleted file mode 100644 index 3625684b6..000000000 --- a/roll/third_party/sglang/v0410post2_patch/model_runner.py +++ /dev/null @@ -1,195 +0,0 @@ -import logging -from dataclasses import dataclass -import torch -import torch.distributed as dist -import datetime - -from roll.platforms import current_platform - - -from sglang.srt.model_executor.model_runner import ModelRunner, UNBALANCED_MODEL_LOADING_TIMEOUT_S -from sglang.srt.configs.device_config import DeviceConfig -from sglang.srt.configs.load_config import LoadConfig -from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp -from sglang.srt.distributed import get_tp_group -from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer -from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state -from sglang.srt.model_loader import get_model -from sglang.srt.utils import ( - get_available_gpu_memory, - monkey_patch_vllm_gguf_config, - set_cuda_arch, -) - -from roll.utils.collective import collective -from roll.utils.functionals import get_dist_info_from_comm_plan -from roll.platforms import current_platform - -logger = logging.getLogger(__name__) - - -class ModelRunnerSA(ModelRunner): - def load_model(self): - before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - logger.info( - f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB" - ) - - # This can reduce thread conflicts and speed up weight loading. - if self.device != "cpu": - torch.set_num_threads(1) - if self.device == "cuda": - if current_platform.get_device_capability()[0] < 8: - logger.info( - "Compute capability below sm80. Use float16 due to lack of bfloat16 support." 
- ) - self.model_config.dtype = torch.float16 - if current_platform.get_device_capability()[1] < 5: - raise RuntimeError("SGLang only supports sm75 and above.") - - set_cuda_arch() - - # Prepare the model config - self.load_config = LoadConfig( - load_format=self.server_args.load_format, - download_dir=self.server_args.download_dir, - model_loader_extra_config=self.server_args.model_loader_extra_config, - ) - if self.device == "cpu": - self.model_config = adjust_config_with_unaligned_cpu_tp( - self.model_config, self.load_config, self.tp_size - ) - if self.server_args.load_format == "gguf": - monkey_patch_vllm_gguf_config() - - # Load the model - # Remove monkey_patch when linear.py quant remove dependencies with vllm - monkey_patch_vllm_parallel_state() - monkey_patch_isinstance_for_vllm_base_layer() - - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device), - ) - monkey_patch_vllm_parallel_state(reverse=True) - monkey_patch_isinstance_for_vllm_base_layer(reverse=True) - - if self.server_args.kv_cache_dtype == "fp8_e4m3": - if self.server_args.quantization_param_path is not None: - if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales( - self.server_args.quantization_param_path - ) - logger.info( - "Loaded KV cache scaling factors from %s", - self.server_args.quantization_param_path, - ) - else: - raise RuntimeError( - "Using FP8 KV cache and scaling factors provided but " - "model %s does not support loading scaling factors.", - self.model.__class__, - ) - else: - logger.warning( - "Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!" - ) - - # Parse other args - self.sliding_window_size = None - if hasattr(self.model, "get_attention_sliding_window_size"): - self.sliding_window_size = self.model.get_attention_sliding_window_size() - elif self.model_config.attention_chunk_size is not None: - self.sliding_window_size = self.model_config.attention_chunk_size - logger.info( - f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}" - ) - - self.dtype = self.model_config.dtype - - after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - self.weight_load_mem_usage = before_avail_memory - after_avail_memory - logger.info( - f"Load weight end. " - f"type={type(self.model).__name__}, " - f"dtype={self.dtype}, " - f"avail mem={after_avail_memory:.2f} GB, " - f"mem usage={self.weight_load_mem_usage:.2f} GB." - ) - - # Handle the case where some ranks do not finish loading. - try: - dist.monitored_barrier( - group=get_tp_group().cpu_group, - timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S), - wait_all_ranks=True, - ) - except RuntimeError: - raise ValueError( - f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node." - ) from None - - def setup_collective_group(self, comm_plan, backend, rank_in_cluster): - self.model_update_comm_plan = getattr(self, "model_update_comm_plan", {}) - rank, comm_plan_args = get_dist_info_from_comm_plan(comm_plan, rank_in_cluster=rank_in_cluster, - rank_in_worker=dist.get_rank()) - if rank is None: - logger.info(f"no comm_plan found for rank {rank_in_cluster}/{dist.get_rank()}") - return True, "Succeeded to setup_collective_group." 
- - group_name = comm_plan_args["group_name"] - master_addr = comm_plan_args["master_addr"] - master_port = comm_plan_args["master_port"] - world_size = len(comm_plan_args["tgt_devices"]) + 1 - src_pp_rank = comm_plan_args["src_pp_rank"] - collective.init_collective_group(world_size, rank, backend=backend, group_name=group_name, - master_addr=master_addr, master_port=master_port) - # A small all_reduce for warmup. - collective.allreduce(torch.zeros(1).cuda(), group_name=group_name) - self.model_update_comm_plan[src_pp_rank] = dict(rank=rank, - world_size=world_size, - src_pp_rank=src_pp_rank, - group_name=group_name, - comm_plan=comm_plan, - comm_plan_args=comm_plan_args) - logger.info(f"warmup setup_collective_group: {group_name} rank: {rank} world_size: {world_size}") - return True, "Succeeded to setup_collective_group." - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_bucket." - - comm_plan = self.model_update_comm_plan[src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter_in_bucket(meta_infos, buffer, [dist.get_rank()]) - return True, "Succeeded to broadcast_bucket." - - def broadcast_parameter(self, src_pp_rank, dtype, shape, parameter_name): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_parameter." - comm_plan = self.model_update_comm_plan[src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter(parameter_name, weight, [dist.get_rank()]) - return True, "Succeeded to broadcast_parameter." - - def update_parameter(self, parameter_name, weight, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter." - self.model.load_weights([(parameter_name, weight)]) - del weight - return True, "Succeeded to update_parameter." - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter_in_bucket." - from mcore_adapter.models.converter.convert_utils import RecvBucketManager - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - self.model.load_weights([(name, weight) for name, weight in named_params.items()]) - return True, "Succeeded to update_parameter_in_bucket." 
\ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/scheduler.py b/roll/third_party/sglang/v0410post2_patch/scheduler.py deleted file mode 100644 index ab13d80f2..000000000 --- a/roll/third_party/sglang/v0410post2_patch/scheduler.py +++ /dev/null @@ -1,96 +0,0 @@ -import torch -from roll.platforms import current_platform -from sglang.srt.managers.io_struct import ( - ReleaseMemoryOccupationReqInput, - ReleaseMemoryOccupationReqOutput, - ResumeMemoryOccupationReqOutput, - ResumeMemoryOccupationReqInput, -) -from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE - -from sglang.srt.managers.scheduler import Scheduler -from sglang.srt.managers.scheduler_update_weights_mixin import _import_static_state, _export_static_state - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class SchedulerSA(Scheduler): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v0410post2_patch.tp_worker import TpModelWorkerClientSA, TpModelWorkerSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorkerClient'] = TpModelWorkerClientSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - func_map_patch = [(SetupCollectiveGroupReqInput, self.setup_collective_group), - (BroadcastBucketReqInput, self.broadcast_bucket), - (BroadcastParameterReqInput, self.broadcast_parameter), - (UpdateParameterInBucketReqInput, self.update_parameter_in_bucket), - (UpdateParameterReqInput, self.update_parameter)] - self._request_dispatcher._mapping += func_map_patch - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.tp_worker.setup_collective_group(recv_req) - return SetupCollectiveGroupReqOutput(success, message) - - def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput): - self.stashed_model_static_state = _export_static_state( - self.tp_worker.worker.model_runner.model - ) - self.tp_worker.worker.model_runner.model.to('cpu') - self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE) - self.flush_cache() - return ReleaseMemoryOccupationReqOutput() - - def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput): - self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE) - - # gc.collect() - # current_platform.empty_cache() - # self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - _import_static_state( - self.tp_worker.worker.model_runner.model, self.stashed_model_static_state - ) - del self.stashed_model_static_state - - self.tp_worker.worker.model_runner.init_cublas() - self.tp_worker.worker.model_runner.init_attention_backend() - from sglang.srt.model_executor.cuda_graph_runner import set_global_graph_memory_pool - set_global_graph_memory_pool(None) - self.tp_worker.worker.model_runner.init_cuda_graphs() - - return ResumeMemoryOccupationReqOutput() - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.tp_worker.broadcast_bucket(recv_req) - return BroadcastBucketReqOutput(success, message) - - def broadcast_parameter(self, recv_req: 
BroadcastParameterReqInput): - success, message = self.tp_worker.broadcast_parameter(recv_req) - return BroadcastParameterReqOutput(success, message) - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.tp_worker.update_parameter(recv_req) - return UpdateParameterReqOutput(success, message) - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.tp_worker.update_parameter_in_bucket(recv_req) - return UpdateParameterInBucketReqOutput(success, message) - - -def run_scheduler_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.scheduler'].__dict__['Scheduler'] = SchedulerSA - from sglang.srt.managers.scheduler import run_scheduler_process - return run_scheduler_process(*args, **kwargs) \ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/tokenizer_manager.py b/roll/third_party/sglang/v0410post2_patch/tokenizer_manager.py deleted file mode 100644 index c751b1535..000000000 --- a/roll/third_party/sglang/v0410post2_patch/tokenizer_manager.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -from typing import Optional, Tuple -import fastapi - -from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.managers.tokenizer_manager import TokenizerManager, _Communicator - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class TokenizerManagerSA(TokenizerManager): - def __init__( - self, - server_args: ServerArgs, - port_args: PortArgs, - ): - super().__init__(server_args=server_args, port_args=port_args) - - self.setup_collective_group_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_in_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - - communicator_patch = [( - SetupCollectiveGroupReqOutput, - self.setup_collective_group_communicator.handle_recv, - ), - ( - BroadcastBucketReqOutput, - self.broadcast_bucket_communicator.handle_recv, - ), - ( - BroadcastParameterReqOutput, - self.broadcast_parameter_communicator.handle_recv, - ), - ( - UpdateParameterInBucketReqOutput, - self.update_parameter_in_bucket_communicator.handle_recv, - ), - ( - UpdateParameterReqOutput, - self.update_parameter_communicator.handle_recv, - )] - - self._result_dispatcher._mapping += communicator_patch - - async def setup_collective_group( - self, - obj: SetupCollectiveGroupReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.setup_collective_group_communicator(obj))[0] - return result.success, result.message - - async def broadcast_bucket( - self, - obj: BroadcastBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - 
assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.broadcast_bucket_communicator(obj))[0] - return result.success, result.message - - async def broadcast_parameter( - self, - obj: BroadcastParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.broadcast_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter( - self, - obj: UpdateParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.update_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter_in_bucket( - self, - obj: UpdateParameterInBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.update_parameter_in_bucket_communicator(obj))[0] - return result.success, result.message \ No newline at end of file diff --git a/roll/third_party/sglang/v0410post2_patch/tp_worker.py b/roll/third_party/sglang/v0410post2_patch/tp_worker.py deleted file mode 100644 index 509d6e0df..000000000 --- a/roll/third_party/sglang/v0410post2_patch/tp_worker.py +++ /dev/null @@ -1,86 +0,0 @@ -from sglang.srt.managers.tp_worker import TpModelWorker -from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) - -class TpModelWorkerSA(TpModelWorker): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v0410post2_patch.model_runner import ModelRunnerSA - sys.modules['sglang.srt.managers.tp_worker'].__dict__['ModelRunner'] = ModelRunnerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.model_runner.setup_collective_group( - recv_req.comm_plan, - recv_req.backend, - recv_req.rank_in_cluster, - ) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.model_runner.broadcast_bucket( - recv_req.src_pp_rank, - recv_req.meta_infos, - recv_req.bucket_size, - ) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.model_runner.broadcast_parameter( - recv_req.src_pp_rank, - recv_req.dtype, - recv_req.shape, - recv_req.parameter_name, - ) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.model_runner.update_parameter( - recv_req.parameter_name, - recv_req.weight, - recv_req.ranks_in_worker, - ) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.model_runner.update_parameter_in_bucket( - recv_req.meta_infos, - recv_req.buffer, - recv_req.ranks_in_worker, - ) - return success, message - - -class TpModelWorkerClientSA(TpModelWorkerClient): - 
def __init__(self, *args, **kwargs): - import sys - sys.modules['sglang.srt.managers.tp_worker_overlap_thread'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.worker.setup_collective_group(recv_req) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.worker.broadcast_bucket(recv_req) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.worker.broadcast_parameter(recv_req) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.worker.update_parameter(recv_req) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.worker.update_parameter_in_bucket(recv_req) - return success, message \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/__init__.py b/roll/third_party/sglang/v046post4_patch/__init__.py index fa4bec152..32de7e606 100644 --- a/roll/third_party/sglang/v046post4_patch/__init__.py +++ b/roll/third_party/sglang/v046post4_patch/__init__.py @@ -1,2 +1 @@ from . import engine -from . import scheduler \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/async_engine.py b/roll/third_party/sglang/v046post4_patch/async_engine.py deleted file mode 100644 index 096b069e9..000000000 --- a/roll/third_party/sglang/v046post4_patch/async_engine.py +++ /dev/null @@ -1,169 +0,0 @@ -import asyncio -import traceback -import asyncio -import enum -from roll.utils.logging import get_logger - -logger = get_logger() - - -class SglangInputType(enum.Enum): - ADD = enum.auto() - ABORT = enum.auto() - -def list_endswith(lst, suffix): - # 检查 lst 是否以 suffix 结尾 - return lst[-len(suffix):] == suffix if len(suffix) <= len(lst) else False - -def trim_overlap_tokens(existing_tokens, new_chunk_tokens): - """ - copy trim_overlap in int list - """ - max_overlap = 0 - max_possible = min(len(existing_tokens), len(new_chunk_tokens)) - for i in range(max_possible, 0, -1): - if list_endswith(existing_tokens, new_chunk_tokens[:i]): - max_overlap = i - break - return new_chunk_tokens[max_overlap:] - - -# 用于存放所有abort_rid_set -abort_rid_set = set() -abort_lock = asyncio.Lock() - - -async def producer(thread_queue, asyncio_queue): - PRODUCER_PUT_TIMEOUT = 15 * 60 - while True: - if not thread_queue.empty(): - data = thread_queue.get() - # 收到结束标记 - if data is None: - logger.info("[sglang async engine] receive stop signal, stoping") - break - command, command_data = data - if command == SglangInputType.ABORT: - async with abort_lock: - rid = command_data - abort_rid_set.add(rid) - else: - await asyncio.wait_for(asyncio_queue.put(data), timeout=PRODUCER_PUT_TIMEOUT) - else: - await asyncio.sleep(0.1) - -async def consumer(asyncio_queue, consumer_id, llm, request_complete_callback): - from sglang.srt.managers.io_struct import GenerateReqInput - from roll.distributed.scheduler.protocol import DataProto - - def process_sglang_output(token_ids, meta_info): - # 线上正式使用 - output_data = DataProto(meta_info=meta_info) - output_data.meta_info["output_token_ids"] = token_ids - request_complete_callback(data=output_data) - - # 本地调试使用 - # request_complete_callback(meta_info['request_id'], token_ids) - logger.debug(f"worker_id:{consumer_id} request_id: 
{meta_info['request_id']} finish!") - - try: - while True: - pack_data = await asyncio_queue.get() - asyncio_queue.task_done() - if pack_data is None: - break - - command, data = pack_data - - rid, input_ids, sampling_params, meta_info = data - rid_str = rid[0] - async with abort_lock: - if rid_str in abort_rid_set: - logger.debug(f"request_id: {rid_str} do not running!") - abort_rid_set.remove(rid_str) - continue - - final_tokens = [[] for _ in range(sampling_params['n'])] - logger.debug(f"worker_id:{consumer_id} request_id: {rid} starting!") - - parallel_sample_num = 1 - if sampling_params['n'] > 1: - rid = [rid] - parallel_sample_num = sampling_params['n'] - - obj = GenerateReqInput( - # text=prompt, - input_ids=input_ids, - rid=rid, - sampling_params=sampling_params, - stream=True, - ) - generator = llm.tokenizer_manager.generate_request(obj, None) - - # generator = await llm.async_generate(prompt, sampling_params, rid=rid, stream=True) - generate_success = True - async for chunk in generator: - # chunk_text = chunk["text"] - async with abort_lock: - if rid_str in abort_rid_set: - cur_abort_rid = chunk['meta_info']['id'] - - logger.debug(f"request_id: {rid_str}-{cur_abort_rid} aborting!") - llm.tokenizer_manager.abort_request(cur_abort_rid) - logger.debug(f"request_id: {rid_str}-{cur_abort_rid} abort success!") - parallel_sample_num -= 1 - - if parallel_sample_num == 0: - abort_rid_set.remove(rid_str) - generate_success = False - break - - chunk_tokens = chunk["output_ids"] - chunk_index = chunk.get("index", 0) - # logger.info(chunk["meta_info"]) - cleaned_chunk = trim_overlap_tokens(final_tokens[chunk_index], chunk_tokens) - final_tokens[chunk_index] += cleaned_chunk - # logger.info(f"consumer_id:{consumer_id} consumer finish: {final_text}") - if generate_success: - process_sglang_output(final_tokens, meta_info) - # request_complete_callback(rid, final_tokens) - except Exception as e: - logger.info(traceback.format_exc()) - -async def predict_in_asyncio(model, request_complete_callback, thread_queue): - PARALLELISM_WORKER_CNT = 128 - PRODUCER_BUFFER_SIZE = 40 - - logger.info("[sglang asyncio] env setup...") - async with abort_lock: - abort_rid_set.clear() - asyncio_queue = asyncio.Queue(maxsize=PRODUCER_BUFFER_SIZE) - producer_task = asyncio.create_task(producer(thread_queue, asyncio_queue)) - consumers = [asyncio.create_task(consumer(asyncio_queue, i, model, request_complete_callback)) for i in range(PARALLELISM_WORKER_CNT)] - logger.info("[sglang asyncio] env setup (done)") - - await producer_task - logger.info("[sglang asyncio] killing consumers ...") - for _ in range(len(consumers)): - await asyncio_queue.put(None) - # await asyncio_queue.join() - logger.info("[sglang asyncio] finish signal has set") - try: - await asyncio.wait_for(asyncio.gather(*consumers), timeout=30) - except asyncio.TimeoutError: - logger.info("Timeout: Not all tasks completed within the time limit") - # model.tokenizer_manager.asyncio_tasks.clear() - # model.tokenizer_manager.no_create_loop = False - logger.info("killing workers done, AsyncSglangEngine stop success") - -def start_async_sglang(loop, model, request_complete_callback, thread_queue): - try: - loop.run_until_complete(predict_in_asyncio(model, request_complete_callback, thread_queue=thread_queue)) - except Exception as e: - logger.info(f"async_sglang thread raise Exception!\n{traceback.format_exc()}") - -def add_request(thread_queue, data): - thread_queue.put((SglangInputType.ADD, data)) - -def abort_request(thread_queue, rid): - 
thread_queue.put((SglangInputType.ABORT, rid)) diff --git a/roll/third_party/sglang/v046post4_patch/engine.py b/roll/third_party/sglang/v046post4_patch/engine.py index 64c40b123..cf0a47f21 100644 --- a/roll/third_party/sglang/v046post4_patch/engine.py +++ b/roll/third_party/sglang/v046post4_patch/engine.py @@ -1,101 +1,53 @@ -import asyncio -from sglang.srt.entrypoints.engine import Engine +import os +import multiprocessing as mp -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, +import sglang.srt.entrypoints.engine as engine_module +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import ( + maybe_set_triton_cache_manager, + set_prometheus_multiproc_dir, + set_ulimit, ) -import sglang.srt.entrypoints.engine as engine_module +# Remove signal handler. singla.signal in python can only run in MainThread which fails when using Ray Async Actor. +def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = "0" + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1" + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = "AUTO" -class EngineSA(Engine): + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() - def setup_collective_group( - self, - comm_plan: str, - backend: str, - rank_in_cluster: int, - ): - obj = SetupCollectiveGroupReqInput( - comm_plan=comm_plan, - backend=backend, - rank_in_cluster=rank_in_cluster, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.setup_collective_group(obj, None) - ) - - def broadcast_bucket( - self, - src_pp_rank: int, - meta_infos: dict, - bucket_size: int, - ): - obj = BroadcastBucketReqInput( - src_pp_rank=src_pp_rank, - meta_infos=meta_infos, - bucket_size=bucket_size, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_bucket(obj, None) - ) - - def broadcast_parameter( - self, - src_pp_rank, - dtype, - shape, - parameter_name - ): - obj = BroadcastParameterReqInput( - src_pp_rank=src_pp_rank, - dtype=dtype, - shape=shape, - parameter_name=parameter_name, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_parameter(obj, None) - ) - - def update_parameter( - self, - parameter_name, - weight, - ranks_in_worker - ): - obj = UpdateParameterReqInput( - parameter_name=parameter_name, - weight=weight, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter(obj, None) - ) - - def update_parameter_in_bucket( - self, - meta_infos, - buffer, - ranks_in_worker - ): - """Initialize parameter update group.""" - obj = UpdateParameterInBucketReqInput( - meta_infos=meta_infos, - buffer=buffer, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter_in_bucket(obj, None) - ) + # Set ulimit + set_ulimit() + + # Fix triton bugs + if server_args.tp_size * server_args.dp_size > 1: + # FIXME: remove this after https://github.com/triton-lang/triton/pull/4295 is used as a dependency. 
+ maybe_set_triton_cache_manager() + + # Set mp start method + mp.set_start_method("spawn", force=True) + +def run_scheduler_process(*args, **kwargs): + from roll.third_party.sglang import fp8 + fp8.monkey_patch_fp8() + + from sglang.srt.managers.scheduler import run_scheduler_process + return run_scheduler_process(*args, **kwargs) + +def run_data_parallel_controller_process(*args, **kwargs): + import sys + sys.modules['sglang.srt.managers.data_parallel_controller'].__dict__['run_scheduler_process'] = run_scheduler_process + + from sglang.srt.managers.data_parallel_controller import run_data_parallel_controller_process + return run_data_parallel_controller_process(*args, **kwargs) class _roll_launch_subprocesses(object): def __init__(self, _launch_subprocesses): @@ -103,11 +55,10 @@ def __init__(self, _launch_subprocesses): def __call__(self, *args, **kwargs): import sys - from roll.third_party.sglang.v046post4_patch.tokenizer_manager import TokenizerManagerSA - from roll.third_party.sglang.v046post4_patch.scheduler import run_scheduler_process - - sys.modules['sglang.srt.entrypoints.engine'].__dict__['TokenizerManager'] = TokenizerManagerSA + + sys.modules['sglang.srt.entrypoints.engine'].__dict__['_set_envs_and_config'] = _set_envs_and_config sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_scheduler_process'] = run_scheduler_process + sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_data_parallel_controller_process'] = run_data_parallel_controller_process return self._launch_subprocesses(*args, **kwargs) diff --git a/roll/third_party/sglang/v046post4_patch/io_struct.py b/roll/third_party/sglang/v046post4_patch/io_struct.py deleted file mode 100644 index faa6d156b..000000000 --- a/roll/third_party/sglang/v046post4_patch/io_struct.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class SetupCollectiveGroupReqInput: - comm_plan: dict - backend: int - rank_in_cluster: int - - -@dataclass -class SetupCollectiveGroupReqOutput: - success: bool - message: str - -@dataclass -class BroadcastBucketReqInput: - src_pp_rank: str - meta_infos: dict - bucket_size: int - - -@dataclass -class BroadcastBucketReqOutput: - success: bool - message: str - -@dataclass -class BroadcastParameterReqInput: - src_pp_rank: str - dtype: int - shape: dict - parameter_name: str - - -@dataclass -class BroadcastParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterReqInput: - parameter_name: str - weight: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterInBucketReqInput: - meta_infos: str - buffer: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterInBucketReqOutput: - success: bool - message: str \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/model_runner.py b/roll/third_party/sglang/v046post4_patch/model_runner.py deleted file mode 100644 index 400f0e584..000000000 --- a/roll/third_party/sglang/v046post4_patch/model_runner.py +++ /dev/null @@ -1,190 +0,0 @@ -import logging -from dataclasses import dataclass -import torch -import torch.distributed as dist -import datetime - -from roll.platforms import current_platform - -from sglang.srt.model_executor.model_runner import ModelRunner, UNBALANCED_MODEL_LOADING_TIMEOUT_S -from sglang.srt.configs.device_config import DeviceConfig -from sglang.srt.configs.load_config import LoadConfig -from sglang.srt.distributed import get_tp_group -from 
sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer -from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state -from sglang.srt.model_loader import get_model -from sglang.srt.utils import ( - get_available_gpu_memory, - monkey_patch_vllm_gguf_config, - set_cuda_arch, -) - -from roll.utils.collective import collective -from roll.utils.functionals import get_dist_info_from_comm_plan -from roll.platforms import current_platform - -logger = logging.getLogger(__name__) - - -class ModelRunnerSA(ModelRunner): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def load_model(self): - before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - logger.info( - f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB" - ) - - # This can reduce thread conflicts and speed up weight loading. - if self.device != "cpu": - torch.set_num_threads(1) - if self.device == current_platform.device_type: - if current_platform.get_device_capability()[0] < 8: - if self.should_log: - logger.info( - "Compute capability below sm80. Use float16 due to lack of bfloat16 support." - ) - self.server_args.dtype = "float16" - self.model_config.dtype = torch.float16 - if current_platform.get_device_capability()[1] < 5: - raise RuntimeError("SGLang only supports sm75 and above.") - - set_cuda_arch() - - # Prepare the model config - self.load_config = LoadConfig( - load_format=self.server_args.load_format, - download_dir=self.server_args.download_dir, - ) - if self.server_args.load_format == "gguf": - monkey_patch_vllm_gguf_config() - - # Load the model - # Remove monkey_patch when linear.py quant remove dependencies with vllm - monkey_patch_vllm_parallel_state() - monkey_patch_isinstance_for_vllm_base_layer() - - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device), - ) - monkey_patch_vllm_parallel_state(reverse=True) - monkey_patch_isinstance_for_vllm_base_layer(reverse=True) - - if self.server_args.kv_cache_dtype == "fp8_e4m3": - if self.server_args.quantization_param_path is not None: - if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales( - self.server_args.quantization_param_path - ) - if self.should_log: - logger.info( - "Loaded KV cache scaling factors from %s", - self.server_args.quantization_param_path, - ) - else: - raise RuntimeError( - "Using FP8 KV cache and scaling factors provided but " - "model %s does not support loading scaling factors.", - self.model.__class__, - ) - else: - logger.warning( - "Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!" - ) - - # Parse other args - self.sliding_window_size = ( - self.model.get_attention_sliding_window_size() - if hasattr(self.model, "get_attention_sliding_window_size") - else None - ) - self.dtype = self.model_config.dtype - - after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - logger.info( - f"Load weight end. " - f"type={type(self.model).__name__}, " - f"dtype={self.dtype}, " - f"avail mem={after_avail_memory:.2f} GB, " - f"mem usage={(before_avail_memory - after_avail_memory):.2f} GB." - ) - - # Handle the case where some ranks do not finish loading. 
- try: - dist.monitored_barrier( - group=get_tp_group().cpu_group, - timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S), - wait_all_ranks=True, - ) - except RuntimeError: - raise ValueError( - f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node." - ) from None - - def setup_collective_group(self, comm_plan, backend, rank_in_cluster): - self.model_update_comm_plan = getattr(self, "model_update_comm_plan", {}) - rank, comm_plan_args = get_dist_info_from_comm_plan(comm_plan, rank_in_cluster=rank_in_cluster, - rank_in_worker=dist.get_rank()) - if rank is None: - logger.info(f"no comm_plan found for rank {rank_in_cluster}/{dist.get_rank()}") - return True, "Succeeded to setup_collective_group." - - group_name = comm_plan_args["group_name"] - master_addr = comm_plan_args["master_addr"] - master_port = comm_plan_args["master_port"] - world_size = len(comm_plan_args["tgt_devices"]) + 1 - src_pp_rank = comm_plan_args["src_pp_rank"] - collective.init_collective_group(world_size, rank, backend=backend, group_name=group_name, - master_addr=master_addr, master_port=master_port) - # A small all_reduce for warmup. - collective.allreduce(torch.zeros(1).to(current_platform.device_type), group_name=group_name) - self.model_update_comm_plan[src_pp_rank] = dict(rank=rank, - world_size=world_size, - src_pp_rank=src_pp_rank, - group_name=group_name, - comm_plan=comm_plan, - comm_plan_args=comm_plan_args) - logger.info(f"warmup setup_collective_group: {group_name} rank: {rank} world_size: {world_size}") - return True, "Succeeded to setup_collective_group." - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_bucket." - - comm_plan = self.model_update_comm_plan[src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter_in_bucket(meta_infos, buffer, [dist.get_rank()]) - return True, "Succeeded to broadcast_bucket." - - def broadcast_parameter(self, src_pp_rank, dtype, shape, parameter_name): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_parameter." - comm_plan = self.model_update_comm_plan[src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter(parameter_name, weight, [dist.get_rank()]) - return True, "Succeeded to broadcast_parameter." - - - def update_parameter(self, parameter_name, weight, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter." - self.model.load_weights([(parameter_name, weight)]) - del weight - return True, "Succeeded to update_parameter." - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter_in_bucket." 
- from mcore_adapter.models.converter.convert_utils import RecvBucketManager - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - self.model.load_weights([(name, weight) for name, weight in named_params.items()]) - return True, "Succeeded to update_parameter_in_bucket." \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/scheduler.py b/roll/third_party/sglang/v046post4_patch/scheduler.py deleted file mode 100644 index 2a23937a8..000000000 --- a/roll/third_party/sglang/v046post4_patch/scheduler.py +++ /dev/null @@ -1,98 +0,0 @@ -import torch -import logging -import torch -from roll.platforms import current_platform -from sglang.srt.managers.io_struct import ( - ReleaseMemoryOccupationReqInput, - ReleaseMemoryOccupationReqOutput, - ResumeMemoryOccupationReqOutput, - ResumeMemoryOccupationReqInput, -) - -from sglang.srt.managers.scheduler import Scheduler, _import_static_state, _export_static_state - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -logger = logging.getLogger(__name__) - - -class SchedulerSA(Scheduler): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v046post4_patch.tp_worker import TpModelWorkerClientSA, TpModelWorkerSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorkerClient'] = TpModelWorkerClientSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - func_map_patch = [(SetupCollectiveGroupReqInput, self.setup_collective_group), - (BroadcastBucketReqInput, self.broadcast_bucket), - (BroadcastParameterReqInput, self.broadcast_parameter), - (UpdateParameterInBucketReqInput, self.update_parameter_in_bucket), - (UpdateParameterReqInput, self.update_parameter)] - self._request_dispatcher._mapping += func_map_patch - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.tp_worker.setup_collective_group(recv_req) - return SetupCollectiveGroupReqOutput(success, message) - - def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput): - self.stashed_model_static_state = _export_static_state( - self.tp_worker.worker.model_runner.model - ) - self.tp_worker.worker.model_runner.model.to('cpu') - self.memory_saver_adapter.pause() - self.flush_cache() - return ReleaseMemoryOccupationReqOutput() - - def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput): - self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - self.memory_saver_adapter.resume() - - # gc.collect() - # torch.cuda.empty_cache() - # self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - _import_static_state( - self.tp_worker.worker.model_runner.model, self.stashed_model_static_state - ) - del self.stashed_model_static_state - - self.tp_worker.worker.model_runner.init_cublas() - self.tp_worker.worker.model_runner.init_attention_backend() - from sglang.srt.model_executor.cuda_graph_runner import set_global_graph_memory_pool - set_global_graph_memory_pool(None) - self.tp_worker.worker.model_runner.init_cuda_graphs() - - 
return ResumeMemoryOccupationReqOutput() - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.tp_worker.broadcast_bucket(recv_req) - return BroadcastBucketReqOutput(success, message) - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.tp_worker.broadcast_parameter(recv_req) - return BroadcastParameterReqOutput(success, message) - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.tp_worker.update_parameter(recv_req) - return UpdateParameterReqOutput(success, message) - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.tp_worker.update_parameter_in_bucket(recv_req) - return UpdateParameterInBucketReqOutput(success, message) - - -def run_scheduler_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.scheduler'].__dict__['Scheduler'] = SchedulerSA - from sglang.srt.managers.scheduler import run_scheduler_process - return run_scheduler_process(*args, **kwargs) \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/tokenizer_manager.py b/roll/third_party/sglang/v046post4_patch/tokenizer_manager.py deleted file mode 100644 index c751b1535..000000000 --- a/roll/third_party/sglang/v046post4_patch/tokenizer_manager.py +++ /dev/null @@ -1,126 +0,0 @@ -import os -from typing import Optional, Tuple -import fastapi - -from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.managers.tokenizer_manager import TokenizerManager, _Communicator - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class TokenizerManagerSA(TokenizerManager): - def __init__( - self, - server_args: ServerArgs, - port_args: PortArgs, - ): - super().__init__(server_args=server_args, port_args=port_args) - - self.setup_collective_group_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_in_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - - communicator_patch = [( - SetupCollectiveGroupReqOutput, - self.setup_collective_group_communicator.handle_recv, - ), - ( - BroadcastBucketReqOutput, - self.broadcast_bucket_communicator.handle_recv, - ), - ( - BroadcastParameterReqOutput, - self.broadcast_parameter_communicator.handle_recv, - ), - ( - UpdateParameterInBucketReqOutput, - self.update_parameter_in_bucket_communicator.handle_recv, - ), - ( - UpdateParameterReqOutput, - self.update_parameter_communicator.handle_recv, - )] - - self._result_dispatcher._mapping += communicator_patch - - async def setup_collective_group( - self, - obj: SetupCollectiveGroupReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await 
self.setup_collective_group_communicator(obj))[0] - return result.success, result.message - - async def broadcast_bucket( - self, - obj: BroadcastBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.broadcast_bucket_communicator(obj))[0] - return result.success, result.message - - async def broadcast_parameter( - self, - obj: BroadcastParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.broadcast_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter( - self, - obj: UpdateParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.update_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter_in_bucket( - self, - obj: UpdateParameterInBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - assert ( - self.server_args.dp_size == 1 - ), "dp_size must be 1 for init parameter update group" - result = (await self.update_parameter_in_bucket_communicator(obj))[0] - return result.success, result.message \ No newline at end of file diff --git a/roll/third_party/sglang/v046post4_patch/tp_worker.py b/roll/third_party/sglang/v046post4_patch/tp_worker.py deleted file mode 100644 index cf37cd5a0..000000000 --- a/roll/third_party/sglang/v046post4_patch/tp_worker.py +++ /dev/null @@ -1,86 +0,0 @@ -from sglang.srt.managers.tp_worker import TpModelWorker -from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) - -class TpModelWorkerSA(TpModelWorker): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v046post4_patch.model_runner import ModelRunnerSA - sys.modules['sglang.srt.managers.tp_worker'].__dict__['ModelRunner'] = ModelRunnerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.model_runner.setup_collective_group( - recv_req.comm_plan, - recv_req.backend, - recv_req.rank_in_cluster, - ) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.model_runner.broadcast_bucket( - recv_req.src_pp_rank, - recv_req.meta_infos, - recv_req.bucket_size, - ) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.model_runner.broadcast_parameter( - recv_req.src_pp_rank, - recv_req.dtype, - recv_req.shape, - recv_req.parameter_name, - ) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.model_runner.update_parameter( - recv_req.parameter_name, - recv_req.weight, - recv_req.ranks_in_worker, - ) - return success, message - - def update_parameter_in_bucket(self, 
recv_req: UpdateParameterInBucketReqInput): - success, message = self.model_runner.update_parameter_in_bucket( - recv_req.meta_infos, - recv_req.buffer, - recv_req.ranks_in_worker, - ) - return success, message - - -class TpModelWorkerClientSA(TpModelWorkerClient): - def __init__(self, *args, **kwargs): - import sys - sys.modules['sglang.srt.managers.tp_worker_overlap_thread'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.worker.setup_collective_group(recv_req) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.worker.broadcast_bucket(recv_req) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.worker.broadcast_parameter(recv_req) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.worker.update_parameter(recv_req) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.worker.update_parameter_in_bucket(recv_req) - return success, message \ No newline at end of file diff --git a/roll/third_party/sglang/v052_patch/__init__.py b/roll/third_party/sglang/v052_patch/__init__.py index fa4bec152..32de7e606 100644 --- a/roll/third_party/sglang/v052_patch/__init__.py +++ b/roll/third_party/sglang/v052_patch/__init__.py @@ -1,2 +1 @@ from . import engine -from . import scheduler \ No newline at end of file diff --git a/roll/third_party/sglang/v052_patch/engine.py b/roll/third_party/sglang/v052_patch/engine.py index 12fe03aa1..48b2098ea 100644 --- a/roll/third_party/sglang/v052_patch/engine.py +++ b/roll/third_party/sglang/v052_patch/engine.py @@ -1,111 +1,66 @@ -import asyncio -from sglang.srt.entrypoints.engine import Engine +import os +import time +import random +import multiprocessing as mp -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) import sglang.srt.entrypoints.engine as engine_module +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import ( + set_prometheus_multiproc_dir, + set_ulimit, +) -class EngineSA(Engine): +# Remove the signal handler: signal.signal in Python can only run in the MainThread, which fails when using a Ray Async Actor.
+def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) + if not server_args.enable_symm_mem: + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = "AUTO" + # flashinfer uses this environment variable for various kernels from MoE to quant kernels + if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0": + os.environ["TRTLLM_ENABLE_PDL"] = "1" + + # Can also be passed as argument + os.environ["SGLANG_RUN_ID"] = ( + f"sglang-run-{time.time()}-{random.randint(0, 100000000)}" + ) + + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + + # Set ulimit + set_ulimit() + + # Set mp start method + mp.set_start_method("spawn", force=True) + +def run_scheduler_process(*args, **kwargs): + from roll.third_party.sglang import fp8 + fp8.monkey_patch_fp8() + + from sglang.srt.managers.scheduler import run_scheduler_process + return run_scheduler_process(*args, **kwargs) + +def run_data_parallel_controller_process(*args, **kwargs): + import sys + sys.modules['sglang.srt.managers.data_parallel_controller'].__dict__['run_scheduler_process'] = run_scheduler_process + + from sglang.srt.managers.data_parallel_controller import run_data_parallel_controller_process + return run_data_parallel_controller_process(*args, **kwargs) - def setup_collective_group( - self, - comm_plan: str, - backend: str, - rank_in_cluster: int, - ): - obj = SetupCollectiveGroupReqInput( - comm_plan=comm_plan, - backend=backend, - rank_in_cluster=rank_in_cluster, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.setup_collective_group(obj, None) - ) - - def broadcast_bucket( - self, - src_pp_rank: int, - meta_infos: dict, - bucket_size: int, - ): - obj = BroadcastBucketReqInput( - src_pp_rank=src_pp_rank, - meta_infos=meta_infos, - bucket_size=bucket_size, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_bucket(obj, None) - ) - - def broadcast_parameter( - self, - src_pp_rank, - dtype, - shape, - parameter_name - ): - obj = BroadcastParameterReqInput( - src_pp_rank=src_pp_rank, - dtype=dtype, - shape=shape, - parameter_name=parameter_name, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_parameter(obj, None) - ) - - def update_parameter( - self, - parameter_name, - weight, - ranks_in_worker - ): - obj = UpdateParameterReqInput( - parameter_name=parameter_name, - weight=weight, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter(obj, None) - ) - - def update_parameter_in_bucket( - self, - meta_infos, - buffer, - ranks_in_worker - ): - """Initialize parameter update group.""" - obj = UpdateParameterInBucketReqInput( - meta_infos=meta_infos, - buffer=buffer, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter_in_bucket(obj, None) - ) - class _roll_launch_subprocesses(object): def __init__(self, _launch_subprocesses): self._launch_subprocesses = _launch_subprocesses def __call__(self, *args, **kwargs): import sys - from roll.third_party.sglang.v052_patch.tokenizer_manager import TokenizerManagerSA - from 
roll.third_party.sglang.v052_patch.scheduler import run_scheduler_process, run_data_parallel_controller_process - - sys.modules['sglang.srt.entrypoints.engine'].__dict__['TokenizerManager'] = TokenizerManagerSA + + sys.modules['sglang.srt.entrypoints.engine'].__dict__['_set_envs_and_config'] = _set_envs_and_config sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_scheduler_process'] = run_scheduler_process sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_data_parallel_controller_process'] = run_data_parallel_controller_process return self._launch_subprocesses(*args, **kwargs) diff --git a/roll/third_party/sglang/v052_patch/io_struct.py b/roll/third_party/sglang/v052_patch/io_struct.py deleted file mode 100644 index faa6d156b..000000000 --- a/roll/third_party/sglang/v052_patch/io_struct.py +++ /dev/null @@ -1,62 +0,0 @@ -from dataclasses import dataclass - -@dataclass -class SetupCollectiveGroupReqInput: - comm_plan: dict - backend: int - rank_in_cluster: int - - -@dataclass -class SetupCollectiveGroupReqOutput: - success: bool - message: str - -@dataclass -class BroadcastBucketReqInput: - src_pp_rank: str - meta_infos: dict - bucket_size: int - - -@dataclass -class BroadcastBucketReqOutput: - success: bool - message: str - -@dataclass -class BroadcastParameterReqInput: - src_pp_rank: str - dtype: int - shape: dict - parameter_name: str - - -@dataclass -class BroadcastParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterReqInput: - parameter_name: str - weight: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterReqOutput: - success: bool - message: str - -@dataclass -class UpdateParameterInBucketReqInput: - meta_infos: str - buffer: int - ranks_in_worker: dict - - -@dataclass -class UpdateParameterInBucketReqOutput: - success: bool - message: str \ No newline at end of file diff --git a/roll/third_party/sglang/v052_patch/model_runner.py b/roll/third_party/sglang/v052_patch/model_runner.py deleted file mode 100644 index ce1832d8d..000000000 --- a/roll/third_party/sglang/v052_patch/model_runner.py +++ /dev/null @@ -1,200 +0,0 @@ -import logging -from dataclasses import dataclass -import torch -import torch.distributed as dist -import datetime - -from roll.platforms import current_platform - - -from sglang.srt.model_executor.model_runner import ModelRunner, UNBALANCED_MODEL_LOADING_TIMEOUT_S -from sglang.srt.configs.device_config import DeviceConfig -from sglang.srt.configs.load_config import LoadConfig -from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp -from sglang.srt.distributed import get_tp_group -from sglang.srt.layers.quantization import monkey_patch_isinstance_for_vllm_base_layer -from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state -from sglang.srt.model_loader import get_model -from sglang.srt.offloader import get_offloader - -from sglang.srt.utils import ( - get_available_gpu_memory, - monkey_patch_vllm_gguf_config, - set_cuda_arch, -) - -from roll.utils.collective import collective -from roll.utils.functionals import get_dist_info_from_comm_plan -from roll.platforms import current_platform - -logger = logging.getLogger(__name__) - - -class ModelRunnerSA(ModelRunner): - def load_model(self): - before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - logger.info( - f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB" - ) - - # This can reduce thread conflicts and speed up weight loading. 
- if self.device != "cpu": - torch.set_num_threads(1) - if self.device == current_platform.device_type: - if current_platform.get_device_capability()[0] < 8: - logger.info( - "Compute capability below sm80. Use float16 due to lack of bfloat16 support." - ) - self.server_args.dtype = "float16" - self.model_config.dtype = torch.float16 - if current_platform.get_device_capability()[1] < 5: - raise RuntimeError("SGLang only supports sm75 and above.") - - set_cuda_arch() - - # Prepare the model config - self.load_config = LoadConfig( - load_format=self.server_args.load_format, - download_dir=self.server_args.download_dir, - model_loader_extra_config=self.server_args.model_loader_extra_config, - ) - if self.device == "cpu": - self.model_config = adjust_config_with_unaligned_cpu_tp( - self.model_config, self.load_config, self.tp_size - ) - if self.server_args.load_format == "gguf": - monkey_patch_vllm_gguf_config() - - # Load the model - # Remove monkey_patch when linear.py quant remove dependencies with vllm - monkey_patch_vllm_parallel_state() - monkey_patch_isinstance_for_vllm_base_layer() - - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device), - ) - monkey_patch_vllm_parallel_state(reverse=True) - monkey_patch_isinstance_for_vllm_base_layer(reverse=True) - - get_offloader().post_init() - - if self.server_args.kv_cache_dtype == "fp8_e4m3": - if self.server_args.quantization_param_path is not None: - if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales( - self.server_args.quantization_param_path - ) - logger.info( - "Loaded KV cache scaling factors from %s", - self.server_args.quantization_param_path, - ) - else: - raise RuntimeError( - "Using FP8 KV cache and scaling factors provided but " - "model %s does not support loading scaling factors.", - self.model.__class__, - ) - else: - logger.warning( - "Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!" - ) - - # Parse other args - self.sliding_window_size = None - if hasattr(self.model, "get_attention_sliding_window_size"): - self.sliding_window_size = self.model.get_attention_sliding_window_size() - elif self.model_config.attention_chunk_size is not None: - self.sliding_window_size = self.model_config.attention_chunk_size - logger.info( - f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}" - ) - - self.dtype = self.model_config.dtype - - after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - self.weight_load_mem_usage = before_avail_memory - after_avail_memory - logger.info( - f"Load weight end. " - f"type={type(self.model).__name__}, " - f"dtype={self.dtype}, " - f"avail mem={after_avail_memory:.2f} GB, " - f"mem usage={self.weight_load_mem_usage:.2f} GB." - ) - - # Handle the case where some ranks do not finish loading. - try: - dist.monitored_barrier( - group=get_tp_group().cpu_group, - timeout=datetime.timedelta(seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S), - wait_all_ranks=True, - ) - except RuntimeError: - raise ValueError( - f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node." 
- ) from None - - def setup_collective_group(self, comm_plan, backend, rank_in_cluster): - self.model_update_comm_plan = getattr(self, "model_update_comm_plan", {}) - rank, comm_plan_args = get_dist_info_from_comm_plan(comm_plan, rank_in_cluster=rank_in_cluster, - rank_in_worker=dist.get_rank()) - if rank is None: - logger.info(f"no comm_plan found for rank {rank_in_cluster}/{dist.get_rank()}") - return True, "Succeeded to setup_collective_group." - - group_name = comm_plan_args["group_name"] - master_addr = comm_plan_args["master_addr"] - master_port = comm_plan_args["master_port"] - world_size = len(comm_plan_args["tgt_devices"]) + 1 - src_pp_rank = comm_plan_args["src_pp_rank"] - collective.init_collective_group(world_size, rank, backend=backend, group_name=group_name, - master_addr=master_addr, master_port=master_port) - # A small all_reduce for warmup. - collective.allreduce(torch.zeros(1).to(current_platform.device_type), group_name=group_name) - self.model_update_comm_plan[src_pp_rank] = dict(rank=rank, - world_size=world_size, - src_pp_rank=src_pp_rank, - group_name=group_name, - comm_plan=comm_plan, - comm_plan_args=comm_plan_args) - logger.info(f"warmup setup_collective_group: {group_name} rank: {rank} world_size: {world_size}") - return True, "Succeeded to setup_collective_group." - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_bucket." - - comm_plan = self.model_update_comm_plan[src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter_in_bucket(meta_infos, buffer, [dist.get_rank()]) - return True, "Succeeded to broadcast_bucket." - - def broadcast_parameter(self, src_pp_rank, dtype, shape, parameter_name): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_parameter." - comm_plan = self.model_update_comm_plan[src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter(parameter_name, weight, [dist.get_rank()]) - return True, "Succeeded to broadcast_parameter." - - def update_parameter(self, parameter_name, weight, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter." - self.model.load_weights([(parameter_name, weight)]) - del weight - return True, "Succeeded to update_parameter." - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter_in_bucket." - from mcore_adapter.models.converter.convert_utils import RecvBucketManager - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - self.model.load_weights([(name, weight) for name, weight in named_params.items()]) - return True, "Succeeded to update_parameter_in_bucket." 
\ No newline at end of file diff --git a/roll/third_party/sglang/v052_patch/scheduler.py b/roll/third_party/sglang/v052_patch/scheduler.py deleted file mode 100644 index 48405d4df..000000000 --- a/roll/third_party/sglang/v052_patch/scheduler.py +++ /dev/null @@ -1,108 +0,0 @@ -import torch -from roll.platforms import current_platform - - -from sglang.srt.managers.io_struct import ( - ReleaseMemoryOccupationReqInput, - ReleaseMemoryOccupationReqOutput, - ResumeMemoryOccupationReqOutput, - ResumeMemoryOccupationReqInput, -) -from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE -from sglang.srt.managers.scheduler import Scheduler - -from sglang.srt.managers.scheduler_update_weights_mixin import _import_static_state, _export_static_state - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class SchedulerSA(Scheduler): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v052_patch.tp_worker import TpModelWorkerClientSA, TpModelWorkerSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorkerClient'] = TpModelWorkerClientSA - sys.modules['sglang.srt.managers.scheduler'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - func_map_patch = [(SetupCollectiveGroupReqInput, self.setup_collective_group), - (BroadcastBucketReqInput, self.broadcast_bucket), - (BroadcastParameterReqInput, self.broadcast_parameter), - (UpdateParameterInBucketReqInput, self.update_parameter_in_bucket), - (UpdateParameterReqInput, self.update_parameter)] - self._request_dispatcher._mapping += func_map_patch - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.tp_worker.setup_collective_group(recv_req) - return SetupCollectiveGroupReqOutput(success, message) - - def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput): - self.stashed_model_static_state = _export_static_state( - self.tp_worker.worker.model_runner.model - ) - self.tp_worker.worker.model_runner.model.to('cpu') - self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE) - self.flush_cache() - return ReleaseMemoryOccupationReqOutput() - - def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput): - self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE) - - # gc.collect() - # torch.cuda.empty_cache() - # self.tp_worker.worker.model_runner.model.to(current_platform.current_device()) - _import_static_state( - self.tp_worker.worker.model_runner.model, self.stashed_model_static_state - ) - del self.stashed_model_static_state - - self.tp_worker.worker.model_runner.init_cublas() - self.tp_worker.worker.model_runner.init_attention_backend() - from sglang.srt.model_executor.cuda_graph_runner import set_global_graph_memory_pool - set_global_graph_memory_pool(None) - self.tp_worker.worker.model_runner.init_device_graphs() - - return ResumeMemoryOccupationReqOutput() - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.tp_worker.broadcast_bucket(recv_req) - return BroadcastBucketReqOutput(success, message) - - def broadcast_parameter(self, recv_req: 
BroadcastParameterReqInput): - success, message = self.tp_worker.broadcast_parameter(recv_req) - return BroadcastParameterReqOutput(success, message) - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.tp_worker.update_parameter(recv_req) - return UpdateParameterReqOutput(success, message) - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.tp_worker.update_parameter_in_bucket(recv_req) - return UpdateParameterInBucketReqOutput(success, message) - - -def run_scheduler_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.scheduler'].__dict__['Scheduler'] = SchedulerSA - from sglang.srt.managers.scheduler import run_scheduler_process - return run_scheduler_process(*args, **kwargs) - - -def run_data_parallel_controller_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.data_parallel_controller'].__dict__['run_scheduler_process'] = run_scheduler_process - from sglang.srt.managers.data_parallel_controller import ( - run_data_parallel_controller_process, - ) - return run_data_parallel_controller_process(*args, **kwargs) - diff --git a/roll/third_party/sglang/v052_patch/tokenizer_manager.py b/roll/third_party/sglang/v052_patch/tokenizer_manager.py deleted file mode 100644 index fd84c0f3c..000000000 --- a/roll/third_party/sglang/v052_patch/tokenizer_manager.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -from typing import Optional, Tuple -import fastapi - -from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.managers.tokenizer_manager import TokenizerManager -from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class TokenizerManagerSA(TokenizerManager): - def __init__( - self, - server_args: ServerArgs, - port_args: PortArgs, - ): - super().__init__(server_args=server_args, port_args=port_args) - - self.setup_collective_group_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_in_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - - communicator_patch = [( - SetupCollectiveGroupReqOutput, - self.setup_collective_group_communicator.handle_recv, - ), - ( - BroadcastBucketReqOutput, - self.broadcast_bucket_communicator.handle_recv, - ), - ( - BroadcastParameterReqOutput, - self.broadcast_parameter_communicator.handle_recv, - ), - ( - UpdateParameterInBucketReqOutput, - self.update_parameter_in_bucket_communicator.handle_recv, - ), - ( - UpdateParameterReqOutput, - self.update_parameter_communicator.handle_recv, - )] - - self._result_dispatcher._mapping += communicator_patch - - async def setup_collective_group( - self, - obj: SetupCollectiveGroupReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - 
result = (await self.setup_collective_group_communicator(obj))[0] - return result.success, result.message - - async def broadcast_bucket( - self, - obj: BroadcastBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.broadcast_bucket_communicator(obj))[0] - return result.success, result.message - - async def broadcast_parameter( - self, - obj: BroadcastParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.broadcast_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter( - self, - obj: UpdateParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.update_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter_in_bucket( - self, - obj: UpdateParameterInBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.update_parameter_in_bucket_communicator(obj))[0] - return result.success, result.message \ No newline at end of file diff --git a/roll/third_party/sglang/v052_patch/tp_worker.py b/roll/third_party/sglang/v052_patch/tp_worker.py deleted file mode 100644 index 3840aa5db..000000000 --- a/roll/third_party/sglang/v052_patch/tp_worker.py +++ /dev/null @@ -1,85 +0,0 @@ -from sglang.srt.managers.tp_worker import TpModelWorker -from sglang.srt.managers.tp_worker_overlap_thread import TpModelWorkerClient - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) - -class TpModelWorkerSA(TpModelWorker): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v052_patch.model_runner import ModelRunnerSA - sys.modules['sglang.srt.managers.tp_worker'].__dict__['ModelRunner'] = ModelRunnerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.model_runner.setup_collective_group( - recv_req.comm_plan, - recv_req.backend, - recv_req.rank_in_cluster, - ) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.model_runner.broadcast_bucket( - recv_req.src_pp_rank, - recv_req.meta_infos, - recv_req.bucket_size, - ) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.model_runner.broadcast_parameter( - recv_req.src_pp_rank, - recv_req.dtype, - recv_req.shape, - recv_req.parameter_name, - ) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.model_runner.update_parameter( - recv_req.parameter_name, - recv_req.weight, - recv_req.ranks_in_worker, - ) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.model_runner.update_parameter_in_bucket( - recv_req.meta_infos, - recv_req.buffer, - recv_req.ranks_in_worker, - ) - return success, message - -class TpModelWorkerClientSA(TpModelWorkerClient): - def __init__(self, *args, **kwargs): - import sys - sys.modules['sglang.srt.managers.tp_worker_overlap_thread'].__dict__['TpModelWorker'] = 
TpModelWorkerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.worker.setup_collective_group(recv_req) - return success, message - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.worker.broadcast_bucket(recv_req) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.worker.broadcast_parameter(recv_req) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.worker.update_parameter(recv_req) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.worker.update_parameter_in_bucket(recv_req) - return success, message \ No newline at end of file diff --git a/roll/third_party/sglang/v054_patch/__init__.py b/roll/third_party/sglang/v054_patch/__init__.py index fa4bec152..32de7e606 100644 --- a/roll/third_party/sglang/v054_patch/__init__.py +++ b/roll/third_party/sglang/v054_patch/__init__.py @@ -1,2 +1 @@ from . import engine -from . import scheduler \ No newline at end of file diff --git a/roll/third_party/sglang/v054_patch/engine.py b/roll/third_party/sglang/v054_patch/engine.py index df7f7ba56..b2beb1d31 100644 --- a/roll/third_party/sglang/v054_patch/engine.py +++ b/roll/third_party/sglang/v054_patch/engine.py @@ -1,115 +1,76 @@ -import asyncio -from sglang.srt.entrypoints.engine import Engine - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) +import os +import time +import random +import multiprocessing as mp + import sglang.srt.entrypoints.engine as engine_module +from sglang.srt.server_args import ServerArgs +from sglang.srt.utils import ( + set_prometheus_multiproc_dir, + set_ulimit, +) + + +# Remove the signal handler: signal.signal in Python can only run in the MainThread, which fails when using a Ray Async Actor.
+def _set_envs_and_config(server_args: ServerArgs): + # Set global environments + os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + os.environ["NCCL_CUMEM_ENABLE"] = str(int(server_args.enable_symm_mem)) + if not server_args.enable_symm_mem: + os.environ["NCCL_NVLS_ENABLE"] = str(int(server_args.enable_nccl_nvls)) + os.environ["CUDA_DEVICE_MAX_CONNECTIONS"] = "4" + os.environ["CUDA_MODULE_LOADING"] = "AUTO" + # flashinfer uses this environment variable for various kernels from MoE to quant kernels + if os.environ.get("TRTLLM_ENABLE_PDL", "1") != "0": + os.environ["TRTLLM_ENABLE_PDL"] = "1" + + if os.environ.get("CUTE_DSL_LOG_LEVEL") is None: + # Default to warning level, to avoid too many logs + os.environ["CUTE_DSL_LOG_LEVEL"] = "30" + if os.environ.get("CUTE_DSL_LOG_TO_CONSOLE") is None: + # Need to set log to console, otherwise the log level won't take effect + os.environ["CUTE_DSL_LOG_TO_CONSOLE"] = "1" + + # Can also be passed as argument + os.environ["SGLANG_RUN_ID"] = ( + f"sglang-run-{time.time()}-{random.randint(0, 100000000)}" + ) + # Set prometheus env vars + if server_args.enable_metrics: + set_prometheus_multiproc_dir() + + # Set ulimit + set_ulimit() + + # Set mp start method + mp.set_start_method("spawn", force=True) + +def run_scheduler_process(*args, **kwargs): + from roll.third_party.sglang import fp8 + fp8.monkey_patch_fp8() + + from sglang.srt.managers.scheduler import run_scheduler_process + return run_scheduler_process(*args, **kwargs) + +def run_data_parallel_controller_process(*args, **kwargs): + import sys + sys.modules['sglang.srt.managers.data_parallel_controller'].__dict__['run_scheduler_process'] = run_scheduler_process + + from sglang.srt.managers.data_parallel_controller import run_data_parallel_controller_process + return run_data_parallel_controller_process(*args, **kwargs) -class EngineSA(Engine): - - def setup_collective_group( - self, - comm_plan: str, - backend: str, - rank_in_cluster: int, - ): - obj = SetupCollectiveGroupReqInput( - comm_plan=comm_plan, - backend=backend, - rank_in_cluster=rank_in_cluster, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.setup_collective_group(obj, None) - ) - - def broadcast_bucket( - self, - src_pp_rank: int, - meta_infos: dict, - bucket_size: int, - ): - obj = BroadcastBucketReqInput( - src_pp_rank=src_pp_rank, - meta_infos=meta_infos, - bucket_size=bucket_size, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_bucket(obj, None) - ) - - def broadcast_parameter( - self, - src_pp_rank, - dtype, - shape, - parameter_name - ): - obj = BroadcastParameterReqInput( - src_pp_rank=src_pp_rank, - dtype=dtype, - shape=shape, - parameter_name=parameter_name, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.broadcast_parameter(obj, None) - ) - - def update_parameter( - self, - parameter_name, - weight, - ranks_in_worker - ): - obj = UpdateParameterReqInput( - parameter_name=parameter_name, - weight=weight, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( - self.tokenizer_manager.update_parameter(obj, None) - ) - - def update_parameter_in_bucket( - self, - meta_infos, - buffer, - ranks_in_worker - ): - """Initialize parameter update group.""" - obj = UpdateParameterInBucketReqInput( - meta_infos=meta_infos, - buffer=buffer, - ranks_in_worker=ranks_in_worker, - ) - loop = asyncio.get_event_loop() - return loop.run_until_complete( 
- self.tokenizer_manager.update_parameter_in_bucket(obj, None) - ) - class _roll_launch_subprocesses(object): def __init__(self, _launch_subprocesses): self._launch_subprocesses = _launch_subprocesses - + def __call__(self, *args, **kwargs): import sys - from roll.third_party.sglang.v054_patch.tokenizer_manager import TokenizerManagerSA - from roll.third_party.sglang.v054_patch.scheduler import run_scheduler_process, run_data_parallel_controller_process - - sys.modules['sglang.srt.entrypoints.engine'].__dict__['TokenizerManager'] = TokenizerManagerSA + sys.modules['sglang.srt.entrypoints.engine'].__dict__['_set_envs_and_config'] = _set_envs_and_config sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_scheduler_process'] = run_scheduler_process sys.modules['sglang.srt.entrypoints.engine'].__dict__['run_data_parallel_controller_process'] = run_data_parallel_controller_process return self._launch_subprocesses(*args, **kwargs) -engine_module._launch_subprocesses = _roll_launch_subprocesses(engine_module._launch_subprocesses) \ No newline at end of file +engine_module._launch_subprocesses = _roll_launch_subprocesses(engine_module._launch_subprocesses) diff --git a/roll/third_party/sglang/v054_patch/model_runner.py b/roll/third_party/sglang/v054_patch/model_runner.py deleted file mode 100644 index 12529a4c1..000000000 --- a/roll/third_party/sglang/v054_patch/model_runner.py +++ /dev/null @@ -1,246 +0,0 @@ -import logging -import torch -import torch.distributed as dist -import datetime -import socket -import threading - -from roll.platforms import current_platform - - -from sglang.srt.model_executor.model_runner import ModelRunner, UNBALANCED_MODEL_LOADING_TIMEOUT_S -from sglang.srt.configs.device_config import DeviceConfig -from sglang.srt.configs.load_config import LoadConfig, LoadFormat - -from sglang.srt.configs.update_config import adjust_config_with_unaligned_cpu_tp -from sglang.srt.distributed import get_tp_group -from sglang.srt.distributed.parallel_state import monkey_patch_vllm_parallel_state -from sglang.srt.model_loader import get_model -from sglang.srt.model_loader.remote_instance_weight_loader_utils import ( - trigger_init_weights_send_group_for_remote_instance_request, -) -from sglang.srt.debug_utils.tensor_dump_forward_hook import ( - register_forward_hook_for_model, -) -from sglang.srt.utils.offloader import get_offloader - -from sglang.srt.utils import ( - get_available_gpu_memory, - set_cuda_arch, -) - -from roll.utils.collective import collective -from roll.utils.functionals import get_dist_info_from_comm_plan -from roll.platforms import current_platform - -logger = logging.getLogger(__name__) - - -class ModelRunnerSA(ModelRunner): - def load_model(self): - before_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - logger.info( - f"Load weight begin. avail mem={get_available_gpu_memory(self.device, self.gpu_id):.2f} GB" - ) - - # This can reduce thread conflicts and speed up weight loading. - if self.device != "cpu": - torch.set_num_threads(1) - if self.device == "cuda": - if torch.cuda.get_device_capability()[0] < 8: - logger.info( - "Compute capability below sm80. Use float16 due to lack of bfloat16 support." 
- ) - self.server_args.dtype = "float16" - self.model_config.dtype = torch.float16 - if torch.cuda.get_device_capability()[1] < 5: - raise RuntimeError("SGLang only supports sm75 and above.") - - set_cuda_arch() - - # Prepare the model config - from sglang.srt.configs.modelopt_config import ModelOptConfig - - modelopt_config = ModelOptConfig( - quant=self.server_args.modelopt_quant, - checkpoint_restore_path=self.server_args.modelopt_checkpoint_restore_path, - checkpoint_save_path=self.server_args.modelopt_checkpoint_save_path, - export_path=self.server_args.modelopt_export_path, - quantize_and_serve=self.server_args.quantize_and_serve, - ) - - self.load_config = LoadConfig( - load_format=self.server_args.load_format, - download_dir=self.server_args.download_dir, - model_loader_extra_config=self.server_args.model_loader_extra_config, - tp_rank=self.tp_rank, - remote_instance_weight_loader_seed_instance_ip=self.server_args.remote_instance_weight_loader_seed_instance_ip, - remote_instance_weight_loader_seed_instance_service_port=self.server_args.remote_instance_weight_loader_seed_instance_service_port, - remote_instance_weight_loader_send_weights_group_ports=self.server_args.remote_instance_weight_loader_send_weights_group_ports, - modelopt_config=modelopt_config, - ) - if self.device == "cpu": - self.model_config = adjust_config_with_unaligned_cpu_tp( - self.model_config, self.load_config, self.tp_size - ) - - if self.server_args.load_format == LoadFormat.REMOTE_INSTANCE: - if self.tp_rank == 0: - instance_ip = socket.gethostbyname(socket.gethostname()) - t = threading.Thread( - target=trigger_init_weights_send_group_for_remote_instance_request, - args=( - self.server_args.remote_instance_weight_loader_seed_instance_ip, - self.server_args.remote_instance_weight_loader_seed_instance_service_port, - self.server_args.remote_instance_weight_loader_send_weights_group_ports, - instance_ip, - ), - ) - t.start() - - # Load the model - # Remove monkey_patch when linear.py quant remove dependencies with vllm - monkey_patch_vllm_parallel_state() - - self.model = get_model( - model_config=self.model_config, - load_config=self.load_config, - device_config=DeviceConfig(self.device, self.gpu_id), - ) - monkey_patch_vllm_parallel_state(reverse=True) - - get_offloader().post_init() - - if self.server_args.kv_cache_dtype == "fp8_e4m3": - if self.server_args.quantization_param_path is not None: - if callable(getattr(self.model, "load_kv_cache_scales", None)): - self.model.load_kv_cache_scales( - self.server_args.quantization_param_path - ) - logger.info( - "Loaded KV cache scaling factors from %s", - self.server_args.quantization_param_path, - ) - else: - raise RuntimeError( - "Using FP8 KV cache and scaling factors provided but " - "model %s does not support loading scaling factors.", - self.model.__class__, - ) - else: - logger.warning( - "Using FP8 KV cache but no scaling factors " - "provided. Defaulting to scaling factors of 1.0. " - "This may lead to less accurate results!" 
- ) - - # Parse other args - self.sliding_window_size = None - if hasattr(self.model, "get_attention_sliding_window_size"): - self.sliding_window_size = self.model.get_attention_sliding_window_size() - elif self.model_config.attention_chunk_size is not None: - self.sliding_window_size = self.model_config.attention_chunk_size - logger.info( - f"Setting sliding_window_size to be attention_chunk_size: {self.sliding_window_size}" - ) - - self.dtype = self.model_config.dtype - - after_avail_memory = get_available_gpu_memory(self.device, self.gpu_id) - self.weight_load_mem_usage = before_avail_memory - after_avail_memory - logger.info( - f"Load weight end. " - f"type={type(self.model).__name__}, " - f"dtype={self.dtype}, " - f"avail mem={after_avail_memory:.2f} GB, " - f"mem usage={self.weight_load_mem_usage:.2f} GB." - ) - if self.server_args.debug_tensor_dump_output_folder is not None: - register_forward_hook_for_model( - self.model, - self.server_args.debug_tensor_dump_output_folder, - self.server_args.debug_tensor_dump_layers, - self.tp_size, - self.tp_rank, - self.pp_rank, - ) - - if self.server_args.elastic_ep_backend == "mooncake": - # Mooncake does not support `monitored_barrier` - dist.barrier(group=get_tp_group().cpu_group) - else: - # Handle the case where some ranks do not finish loading. - try: - dist.monitored_barrier( - group=get_tp_group().cpu_group, - timeout=datetime.timedelta( - seconds=UNBALANCED_MODEL_LOADING_TIMEOUT_S - ), - wait_all_ranks=True, - ) - except RuntimeError: - raise ValueError( - f"TP rank {self.tp_rank} could finish the model loading, but there are other ranks that didn't finish loading. It is likely due to unexpected failures (e.g., OOM) or a slow node." - ) from None - - def setup_collective_group(self, comm_plan, backend, rank_in_cluster): - self.model_update_comm_plan = getattr(self, "model_update_comm_plan", {}) - rank, comm_plan_args = get_dist_info_from_comm_plan(comm_plan, rank_in_cluster=rank_in_cluster, - rank_in_worker=dist.get_rank()) - if rank is None: - logger.info(f"no comm_plan found for rank {rank_in_cluster}/{dist.get_rank()}") - return True, "Succeeded to setup_collective_group." - - group_name = comm_plan_args["group_name"] - master_addr = comm_plan_args["master_addr"] - master_port = comm_plan_args["master_port"] - world_size = len(comm_plan_args["tgt_devices"]) + 1 - src_pp_rank = comm_plan_args["src_pp_rank"] - collective.init_collective_group(world_size, rank, backend=backend, group_name=group_name, - master_addr=master_addr, master_port=master_port) - # A small all_reduce for warmup. - collective.allreduce(torch.zeros(1).to(current_platform.device_type), group_name=group_name) - self.model_update_comm_plan[src_pp_rank] = dict(rank=rank, - world_size=world_size, - src_pp_rank=src_pp_rank, - group_name=group_name, - comm_plan=comm_plan, - comm_plan_args=comm_plan_args) - logger.info(f"warmup setup_collective_group: {group_name} rank: {rank} world_size: {world_size}") - return True, "Succeeded to setup_collective_group." - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_bucket." - - comm_plan = self.model_update_comm_plan[src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter_in_bucket(meta_infos, buffer, [dist.get_rank()]) - return True, "Succeeded to broadcast_bucket." 
- - def broadcast_parameter(self, src_pp_rank, dtype, shape, parameter_name): - if src_pp_rank not in self.model_update_comm_plan: - return True, "Succeeded to broadcast_parameter." - comm_plan = self.model_update_comm_plan[src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - self.update_parameter(parameter_name, weight, [dist.get_rank()]) - return True, "Succeeded to broadcast_parameter." - - def update_parameter(self, parameter_name, weight, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter." - self.model.load_weights([(parameter_name, weight)]) - del weight - return True, "Succeeded to update_parameter." - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return True, "Succeeded to update_parameter_in_bucket." - from mcore_adapter.models.converter.convert_utils import RecvBucketManager - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - self.model.load_weights([(name, weight) for name, weight in named_params.items()]) - return True, "Succeeded to update_parameter_in_bucket." \ No newline at end of file diff --git a/roll/third_party/sglang/v054_patch/scheduler.py b/roll/third_party/sglang/v054_patch/scheduler.py deleted file mode 100644 index ed87999ad..000000000 --- a/roll/third_party/sglang/v054_patch/scheduler.py +++ /dev/null @@ -1,105 +0,0 @@ -import torch -from roll.platforms import current_platform - - -from sglang.srt.managers.io_struct import ( - ReleaseMemoryOccupationReqInput, - ReleaseMemoryOccupationReqOutput, - ResumeMemoryOccupationReqOutput, - ResumeMemoryOccupationReqInput, -) -from sglang.srt.constants import GPU_MEMORY_TYPE_KV_CACHE -from sglang.srt.managers.scheduler import Scheduler - -from sglang.srt.managers.scheduler_update_weights_mixin import _import_static_state, _export_static_state - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class SchedulerSA(Scheduler): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v054_patch.tp_worker import TpModelWorkerSA - sys.modules['sglang.srt.managers.tp_worker'].__dict__['TpModelWorker'] = TpModelWorkerSA - super().__init__(*args, **kwargs) - func_map_patch = [(SetupCollectiveGroupReqInput, self.setup_collective_group), - (BroadcastBucketReqInput, self.broadcast_bucket), - (BroadcastParameterReqInput, self.broadcast_parameter), - (UpdateParameterInBucketReqInput, self.update_parameter_in_bucket), - (UpdateParameterReqInput, self.update_parameter)] - self._request_dispatcher._mapping += func_map_patch - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.tp_worker.setup_collective_group(recv_req) - return SetupCollectiveGroupReqOutput(success, message) - - def release_memory_occupation(self, recv_req: ReleaseMemoryOccupationReqInput): - self.stashed_model_static_state = _export_static_state( - self.tp_worker.model_runner.model - ) - 
self.tp_worker.model_runner.model.to('cpu') - self.memory_saver_adapter.pause(GPU_MEMORY_TYPE_KV_CACHE) - self.flush_cache() - return ReleaseMemoryOccupationReqOutput() - - def resume_memory_occupation(self, recv_req: ResumeMemoryOccupationReqInput): - self.tp_worker.model_runner.model.to(current_platform.current_device()) - self.memory_saver_adapter.resume(GPU_MEMORY_TYPE_KV_CACHE) - - # gc.collect() - # torch.cuda.empty_cache() - # self.tp_worker.model_runner.model.to(current_platform.current_device()) - _import_static_state( - self.tp_worker.model_runner.model, self.stashed_model_static_state - ) - del self.stashed_model_static_state - - self.tp_worker.model_runner.init_cublas() - self.tp_worker.model_runner.init_attention_backend() - from sglang.srt.model_executor.cuda_graph_runner import set_global_graph_memory_pool - set_global_graph_memory_pool(None) - self.tp_worker.model_runner.init_device_graphs() - - return ResumeMemoryOccupationReqOutput() - - def broadcast_bucket(self, recv_req: BroadcastBucketReqInput): - success, message = self.tp_worker.broadcast_bucket(recv_req) - return BroadcastBucketReqOutput(success, message) - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.tp_worker.broadcast_parameter(recv_req) - return BroadcastParameterReqOutput(success, message) - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.tp_worker.update_parameter(recv_req) - return UpdateParameterReqOutput(success, message) - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.tp_worker.update_parameter_in_bucket(recv_req) - return UpdateParameterInBucketReqOutput(success, message) - - -def run_scheduler_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.scheduler'].__dict__['Scheduler'] = SchedulerSA - from sglang.srt.managers.scheduler import run_scheduler_process - return run_scheduler_process(*args, **kwargs) - -def run_data_parallel_controller_process(*args, **kwargs): - import sys - sys.modules['sglang.srt.managers.data_parallel_controller'].__dict__['run_scheduler_process'] = run_scheduler_process - from sglang.srt.managers.data_parallel_controller import ( - run_data_parallel_controller_process, - ) - return run_data_parallel_controller_process(*args, **kwargs) diff --git a/roll/third_party/sglang/v054_patch/tokenizer_manager.py b/roll/third_party/sglang/v054_patch/tokenizer_manager.py deleted file mode 100644 index fd84c0f3c..000000000 --- a/roll/third_party/sglang/v054_patch/tokenizer_manager.py +++ /dev/null @@ -1,112 +0,0 @@ -import os -from typing import Optional, Tuple -import fastapi - -from sglang.srt.server_args import PortArgs, ServerArgs -from sglang.srt.managers.tokenizer_manager import TokenizerManager -from sglang.srt.managers.tokenizer_communicator_mixin import _Communicator - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, - SetupCollectiveGroupReqOutput, - BroadcastBucketReqOutput, - BroadcastParameterReqOutput, - UpdateParameterInBucketReqOutput, - UpdateParameterReqOutput, -) - -class TokenizerManagerSA(TokenizerManager): - def __init__( - self, - server_args: ServerArgs, - port_args: PortArgs, - ): - super().__init__(server_args=server_args, port_args=port_args) - - self.setup_collective_group_communicator = _Communicator( - self.send_to_scheduler, 
server_args.dp_size - ) - self.broadcast_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.broadcast_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_in_bucket_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - self.update_parameter_communicator = _Communicator( - self.send_to_scheduler, server_args.dp_size - ) - - communicator_patch = [( - SetupCollectiveGroupReqOutput, - self.setup_collective_group_communicator.handle_recv, - ), - ( - BroadcastBucketReqOutput, - self.broadcast_bucket_communicator.handle_recv, - ), - ( - BroadcastParameterReqOutput, - self.broadcast_parameter_communicator.handle_recv, - ), - ( - UpdateParameterInBucketReqOutput, - self.update_parameter_in_bucket_communicator.handle_recv, - ), - ( - UpdateParameterReqOutput, - self.update_parameter_communicator.handle_recv, - )] - - self._result_dispatcher._mapping += communicator_patch - - async def setup_collective_group( - self, - obj: SetupCollectiveGroupReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.setup_collective_group_communicator(obj))[0] - return result.success, result.message - - async def broadcast_bucket( - self, - obj: BroadcastBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.broadcast_bucket_communicator(obj))[0] - return result.success, result.message - - async def broadcast_parameter( - self, - obj: BroadcastParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.broadcast_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter( - self, - obj: UpdateParameterReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.update_parameter_communicator(obj))[0] - return result.success, result.message - - async def update_parameter_in_bucket( - self, - obj: UpdateParameterInBucketReqInput, - request: Optional[fastapi.Request] = None, - ) -> Tuple[bool, str]: - self.auto_create_handle_loop() - result = (await self.update_parameter_in_bucket_communicator(obj))[0] - return result.success, result.message \ No newline at end of file diff --git a/roll/third_party/sglang/v054_patch/tp_worker.py b/roll/third_party/sglang/v054_patch/tp_worker.py deleted file mode 100644 index eee8a8075..000000000 --- a/roll/third_party/sglang/v054_patch/tp_worker.py +++ /dev/null @@ -1,58 +0,0 @@ -from sglang.srt.managers.tp_worker import TpModelWorker - - -from roll.third_party.sglang.io_struct import ( - SetupCollectiveGroupReqInput, - BroadcastBucketReqInput, - BroadcastParameterReqInput, - UpdateParameterInBucketReqInput, - UpdateParameterReqInput, -) - -class TpModelWorkerSA(TpModelWorker): - def __init__(self, *args, **kwargs): - import sys - from roll.third_party.sglang.v054_patch.model_runner import ModelRunnerSA - sys.modules['sglang.srt.managers.tp_worker'].__dict__['ModelRunner'] = ModelRunnerSA - super().__init__(*args, **kwargs) - - def setup_collective_group(self, recv_req: SetupCollectiveGroupReqInput): - success, message = self.model_runner.setup_collective_group( - recv_req.comm_plan, - recv_req.backend, - recv_req.rank_in_cluster, - ) - return success, message - - def broadcast_bucket(self, 
recv_req: BroadcastBucketReqInput): - success, message = self.model_runner.broadcast_bucket( - recv_req.src_pp_rank, - recv_req.meta_infos, - recv_req.bucket_size, - ) - return success, message - - def broadcast_parameter(self, recv_req: BroadcastParameterReqInput): - success, message = self.model_runner.broadcast_parameter( - recv_req.src_pp_rank, - recv_req.dtype, - recv_req.shape, - recv_req.parameter_name, - ) - return success, message - - def update_parameter(self, recv_req: UpdateParameterReqInput): - success, message = self.model_runner.update_parameter( - recv_req.parameter_name, - recv_req.weight, - recv_req.ranks_in_worker, - ) - return success, message - - def update_parameter_in_bucket(self, recv_req: UpdateParameterInBucketReqInput): - success, message = self.model_runner.update_parameter_in_bucket( - recv_req.meta_infos, - recv_req.buffer, - recv_req.ranks_in_worker, - ) - return success, message \ No newline at end of file diff --git a/roll/third_party/vllm/__init__.py b/roll/third_party/vllm/__init__.py index 3f6c19a28..2c7d061ae 100644 --- a/roll/third_party/vllm/__init__.py +++ b/roll/third_party/vllm/__init__.py @@ -1,30 +1,133 @@ +import os +import pathlib +from typing import Dict, List + +import torch import vllm from packaging.version import Version +from vllm import envs +from vllm.engine.arg_utils import AsyncEngineArgs +from vllm.envs import get_default_cache_root +from vllm.usage.usage_lib import UsageContext +import roll.third_party.vllm.fp8 as fp8 +from roll.utils.import_utils import safe_import_class from roll.utils.logging import get_logger -logger = get_logger() -LLM = None -AsyncLLM = None +logger = get_logger() if Version("0.8.4") == Version(vllm.__version__): - from roll.third_party.vllm.vllm_0_8_4.llm import Llm084 - from roll.third_party.vllm.vllm_0_8_4.v1.async_llm import AsyncLLM084 - LLM = Llm084 - AsyncLLM = AsyncLLM084 -elif Version("0.10.0") <= Version(vllm.__version__) < Version("0.10.2"): - from roll.third_party.vllm.vllm_0_10_0.llm import Llm0100 - from roll.third_party.vllm.vllm_0_10_0.v1.async_llm import AsyncLLM0100 - LLM = Llm0100 - AsyncLLM = AsyncLLM0100 + import roll.third_party.vllm.vllm_0_8_4 # apply patch + ray_executor_class_v0 = safe_import_class("roll.third_party.vllm.vllm_0_8_4.ray_distributed_executor.CustomRayDistributedExecutor") + ray_executor_class_v1 = safe_import_class("roll.third_party.vllm.vllm_0_8_4.v1.ray_distributed_executor.CustomRayDistributedExecutor") elif Version("0.10.2") == Version(vllm.__version__): - from roll.third_party.vllm.vllm_0_10_2.llm import Llm0102 - LLM = Llm0102 -elif Version("0.11.1rc2.dev0+gc3a722fcb.d20251021") == Version(vllm.__version__) or Version("0.11.0") == Version(vllm.__version__): - from roll.third_party.vllm.vllm_0_11_0.llm import Llm0110 - LLM = Llm0110 + ray_executor_class_v0 = safe_import_class("roll.third_party.vllm.vllm_0_10_2.ray_distributed_executor.CustomRayDistributedExecutor") + ray_executor_class_v1 = safe_import_class("roll.third_party.vllm.vllm_0_10_2.v1.ray_distributed_executor.CustomRayDistributedExecutor") +elif Version("0.11.0") == Version(vllm.__version__) or Version("0.11.1rc1") == Version(vllm.__version__) or Version("0.11.1rc2.dev0+gc3a722fcb.d20251021") == Version(vllm.__version__): + ray_executor_class_v0 = safe_import_class("roll.third_party.vllm.vllm_0_11_0.ray_distributed_executor.CustomRayDistributedExecutor") + ray_executor_class_v1 = safe_import_class("roll.third_party.vllm.vllm_0_11_0.v1.ray_distributed_executor.CustomRayDistributedExecutor") +elif 
Version("0.12.0") == Version(vllm.__version__): + ray_executor_class_v0 = None # V0 deprecated + ray_executor_class_v1 = safe_import_class("roll.third_party.vllm.vllm_0_12_0.ray_distributed_executor.CustomRayDistributedExecutor") else: - raise NotImplementedError(f"roll vllm version {vllm.__version__} is not supported.") + ray_executor_class_v0 = None + ray_executor_class_v1 = None + logger.warning(f"ROLL is not tested on vllm version {vllm.__version__}, something strange may happen!!!") + +logger.info("Using vllm version {vllm.__version__}") + + +async def create_async_llm(resource_placement_groups: List[Dict], **kwargs): + kwargs["enable_sleep_mode"] = True + + if "worker_extension_cls" not in kwargs: + # VLLM_USE_V1 is deprecated in vllm>=0.11.1 + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: + kwargs["worker_extension_cls"] = "roll.third_party.vllm.worker.WorkerV1" + else: + kwargs["worker_extension_cls"] = "roll.third_party.vllm.worker.WorkerBase" + + # https://github.com/vllm-project/vllm/pull/14189/files + # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF + os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" + # torch.cuda may already init, explicitly disable expandable_segments + # here (only matters when VLLM_USE_RAY_SPMD_WORKER=0) + torch.cuda.memory._set_allocator_settings("expandable_segments:False") + + os.environ["VLLM_CACHE_ROOT"] = os.path.join(get_default_cache_root(), "vllm", os.environ.get("WORKER_NAME", "")) + + os.environ["FLASHINFER_WORKSPACE_BASE"] = os.path.join( + pathlib.Path.home().as_posix(), ".cache", os.environ.get("WORKER_NAME", "") + ) + + # Default fork method is not compatible with Roll. + os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn" + + engine_args = AsyncEngineArgs(**kwargs) + # VLLM_USE_V1 may be modified inside create_engine_config + vllm_config = engine_args.create_engine_config(UsageContext.ENGINE_CONTEXT) + + fp8.update_quant_config(vllm_config) + + # change parallel_config.placement_group for CustomRayDistributedExecutor + parallel_config = vllm_config.parallel_config + assert len(resource_placement_groups) == parallel_config.world_size + parallel_config.placement_group = resource_placement_groups + + if not hasattr(envs, "VLLM_USE_V1") or envs.VLLM_USE_V1: + from vllm.v1.executor.abstract import Executor + + from roll.third_party.vllm.async_llm import CustomAsyncLLM + + executor_class = Executor.get_class(vllm_config) + if parallel_config.distributed_executor_backend == "ray": + assert ray_executor_class_v1 is not None, ( + f"ROLL does not support using ray distributed executor with vllm version {vllm.__version__}" + ) + executor_class = ray_executor_class_v1 + + logger.info(f"Using executor_class: {executor_class}") + logger.info(f"Using {parallel_config.worker_cls=} {parallel_config.worker_extension_cls=}") + async_llm = CustomAsyncLLM( + vllm_config=vllm_config, + executor_class=executor_class, + start_engine_loop=True, + log_requests=engine_args.enable_log_requests + if hasattr(engine_args, "enable_log_requests") + else not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + usage_context=UsageContext.ENGINE_CONTEXT, + ) + else: + from vllm.v1.engine.async_llm import AsyncLLM + + from roll.third_party.vllm.async_llm_engine import CustomAsyncLLMEngine + + assert not issubclass(CustomAsyncLLMEngine, AsyncLLM) + + executor_class = CustomAsyncLLMEngine._get_executor_cls(vllm_config) + if parallel_config.distributed_executor_backend == "ray": + assert ray_executor_class_v0 is not None, ( + 
f"ROLL does not support using ray distributed executor with vllm version {vllm.__version__}" + ) + executor_class = ray_executor_class_v0 + + logger.info(f"Using executor_class: {executor_class}") + logger.info(f"Using worker cls: {parallel_config.worker_cls}") + async_llm = CustomAsyncLLMEngine( + vllm_config=vllm_config, + executor_class=executor_class, + start_engine_loop=True, + log_requests=not engine_args.disable_log_requests, + log_stats=not engine_args.disable_log_stats, + usage_context=UsageContext.ENGINE_CONTEXT, + stat_loggers=None, + ) + + await async_llm.custom_init_worker() + + return async_llm + -__all__ = ["LLM", "AsyncLLM"] +__all__ = ["create_async_llm"] diff --git a/roll/third_party/vllm/async_llm.py b/roll/third_party/vllm/async_llm.py new file mode 100644 index 000000000..950a06ef5 --- /dev/null +++ b/roll/third_party/vllm/async_llm.py @@ -0,0 +1,28 @@ +from vllm.v1.engine.async_llm import AsyncLLM + + +class CustomAsyncLLM(AsyncLLM): + async def custom_init_worker(self): + await self.engine_core.collective_rpc_async(method="custom_init_worker") + + async def load_states(self): + await self.engine_core.collective_rpc_async(method="load_states") + + async def offload_states(self, level): + await self.reset_prefix_cache() + await self.engine_core.collective_rpc_async(method="offload_states", args=(level,)) + + async def setup_collective_group(self, *args, **kwargs): + await self.engine_core.collective_rpc_async(method="setup_collective_group", args=args, kwargs=kwargs) + + async def broadcast_parameter(self, *args, **kwargs): + await self.engine_core.collective_rpc_async(method="broadcast_parameter", args=args, kwargs=kwargs) + + async def update_parameter_in_bucket(self, serialized_named_tensors, is_lora=False): + await self.engine_core.collective_rpc_async(method="update_parameter_in_bucket", args=(serialized_named_tensors, is_lora)) + + async def add_lora(self, *args, **kwargs): + await self.engine_core.collective_rpc_async(method="custom_add_lora", args=args, kwargs=kwargs) + + async def process_weights_after_loading(self): + await self.engine_core.collective_rpc_async(method="process_weights_after_loading") diff --git a/roll/third_party/vllm/async_llm_engine.py b/roll/third_party/vllm/async_llm_engine.py new file mode 100644 index 000000000..25a7a025e --- /dev/null +++ b/roll/third_party/vllm/async_llm_engine.py @@ -0,0 +1,27 @@ +from vllm.engine.async_llm_engine import AsyncLLMEngine + +class CustomAsyncLLMEngine(AsyncLLMEngine): + async def custom_init_worker(self): + self.engine.model_executor.collective_rpc(method="custom_init_worker") + + async def load_states(self): + self.engine.model_executor.collective_rpc(method="load_states") + + async def offload_states(self, level): + self.reset_prefix_cache() + self.engine.model_executor.collective_rpc(method="offload_states", args=(level,)) + + async def setup_collective_group(self, *args, **kwargs): + self.engine.model_executor.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) + + async def broadcast_parameter(self, *args, **kwargs): + self.engine.model_executor.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) + + async def update_parameter_in_bucket(self, *args, **kwargs): + self.engine.model_executor.collective_rpc(method="update_parameter_in_bucket", args=args, kwargs=kwargs) + + async def add_lora(self, *args, **kwargs): + self.engine.model_executor.collective_rpc(method="custom_add_lora", args=args, kwargs=kwargs) + + async def process_weights_after_loading(self): + 
await self.engine.model_executor.collective_rpc(method="process_weights_after_loading") diff --git a/roll/third_party/vllm/fp8.py b/roll/third_party/vllm/fp8.py index 5e576d251..762c743a5 100644 --- a/roll/third_party/vllm/fp8.py +++ b/roll/third_party/vllm/fp8.py @@ -1,4 +1,4 @@ -from typing import List, Optional +from typing import List from functools import partial import weakref @@ -16,68 +16,11 @@ from vllm._custom_ops import scaled_fp8_quant as per_tensor_fp8_quant from vllm.model_executor.layers.quantization.utils.w8a8_utils import requantize_with_max_scale +from roll.utils.fp8 import per_block_fp8_quant from roll.utils.logging import get_logger logger = get_logger() -# Block quant operator -# -# Borrow from transformers -# https://huggingface.co/docs/transformers/en/quantization/finegrained_fp8 -# https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/quantizers/quantizer_finegrained_fp8.py#L83 -# -# May use op from torchao: -# https://github.com/pytorch/ao/pull/1668 -# https://github.com/volcengine/verl/pull/3084 -def per_block_fp8_quant(param_value: torch.Tensor, weight_block_size: List[int]): - """ - Quantizes weights to FP8 format using Block-wise quantization - """ - # Get FP8 min/max values - fp8_min = torch.finfo(torch.float8_e4m3fn).min - fp8_max = torch.finfo(torch.float8_e4m3fn).max - - block_size_m, block_size_n = weight_block_size - - rows, cols = param_value.shape[-2:] - - if rows % block_size_m != 0 or cols % block_size_n != 0: - raise ValueError( - f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})" - ) - param_value_orig_shape = param_value.shape - - param_value = param_value.reshape( - -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n - ).permute(0, 1, 3, 2, 4) - - # Calculate scaling factor for each block - max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2)) - scale = fp8_max / max_abs - scale_orig_shape = scale.shape - scale = scale.unsqueeze(-1).unsqueeze(-1) - - # Quantize the weights - quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) - - quantized_param = quantized_param.permute(0, 1, 3, 2, 4) - # Reshape back to matrix shape - quantized_param = quantized_param.reshape(param_value_orig_shape) - - # Construct the final, correct shape for the scales - num_row_blocks = rows // block_size_m - num_col_blocks = cols // block_size_n - # This preserves original batch dimensions, if any - final_scale_shape = (*param_value_orig_shape[:-2], num_row_blocks, num_col_blocks) - # Reshape directly to the correct shape and take the reciprocal - scale = scale.reshape(final_scale_shape).reciprocal() - - # TODO: DeepGemm scales need to be transposed and aligned (said in vLLM fp8.py)? - - # TODO: On B200, DeepGemm only support E8M0 scale - - return quantized_param, scale - def update_quant_config(vllm_config): # Use hf_overrides arguments of LLM with weight_block_size # to enable block quantization. 
@@ -208,9 +151,12 @@ def _fp8_moe_w13_weight_loader(layer: weakref.ReferenceType, original_weight_loa target_device = layer.w13_weight.device with target_device: loaded_weight = loaded_weight.to(target_device) - qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) - original_weight_loader(layer.w13_weight, qweight, *args, **kwargs) - original_weight_loader(layer.w13_weight_scale_inv, scale, *args, **kwargs) + if loaded_weight.dtype == torch.float8_e4m3fn: + original_weight_loader(layer.w13_weight, loaded_weight, *args, **kwargs) + else: + qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) + original_weight_loader(layer.w13_weight, qweight, *args, **kwargs) + original_weight_loader(layer.w13_weight_scale_inv, scale, *args, **kwargs) def _fp8_moe_w2_weight_loader(layer: weakref.ReferenceType, original_weight_loader, param: torch.Tensor, loaded_weight: torch.Tensor, *args, **kwargs) -> None: layer = layer() @@ -218,9 +164,12 @@ def _fp8_moe_w2_weight_loader(layer: weakref.ReferenceType, original_weight_load target_device = layer.w2_weight.device with target_device: loaded_weight = loaded_weight.to(target_device) - qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) - original_weight_loader(layer.w2_weight, qweight, *args, **kwargs) - original_weight_loader(layer.w2_weight_scale_inv, scale, *args, **kwargs) + if loaded_weight.dtype == torch.float8_e4m3fn: + original_weight_loader(layer.w2_weight, loaded_weight, *args, **kwargs) + else: + qweight, scale = per_block_fp8_quant(loaded_weight, layer.weight_block_size) + original_weight_loader(layer.w2_weight, qweight, *args, **kwargs) + original_weight_loader(layer.w2_weight_scale_inv, scale, *args, **kwargs) def _fp8_moe_create_weights(self, layer: Module, num_experts: int, hidden_size: int, intermediate_size_per_partition: int, @@ -249,12 +198,12 @@ def _fp8_moe_create_weights(self, layer: Module, num_experts: int, hidden_size: w13_weight_loader = layer.w13_weight.weight_loader w13_weight_loader = partial(_fp8_moe_w13_weight_loader, weakref.ref(layer), w13_weight_loader) layer.w13_weight.weight_loader = w13_weight_loader - set_weight_attrs(layer.w13_weight, {"roll_skip_patch_moe": True}) + set_weight_attrs(layer.w13_weight, {"roll_skip_patch_moe": True}) # TODO: remove once vllm 0.8.4 is deprecated w2_weight_loader = layer.w2_weight.weight_loader w2_weight_loader = partial(_fp8_moe_w2_weight_loader, weakref.ref(layer), w2_weight_loader) layer.w2_weight.weight_loader = w2_weight_loader - set_weight_attrs(layer.w2_weight, {"roll_skip_patch_moe": True}) + set_weight_attrs(layer.w2_weight, {"roll_skip_patch_moe": True}) # TODO: remove once vllm 0.8.4 is deprecated # do not need patch weight loader of scale assert type(layer.w13_weight_scale_inv) == Parameter diff --git a/roll/third_party/vllm/vllm_0_10_0/llm.py b/roll/third_party/vllm/vllm_0_10_0/llm.py deleted file mode 100644 index 56aa1cfdf..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/llm.py +++ /dev/null @@ -1,233 +0,0 @@ -import os -import queue -import time -from typing import Any, Dict, Iterable, List, Optional, Union - -import cloudpickle -import torch -from vllm import LLM, EngineArgs, SamplingParams, envs -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.engine.arg_utils import HfOverrides, PoolerConfig, TaskOption -from vllm.lora.request import LoRARequest -from vllm.usage.usage_lib import 
UsageContext -from vllm.utils import Counter -from vllm.envs import get_default_cache_root - -from roll.third_party.vllm.vllm_0_10_0.llm_engine import LLMEngine0100 -from roll.utils.send_recv_utils import SendBucketManager -from roll.platforms import current_platform - -class Llm0100(LLM): - - def __init__( - self, - resource_placement_groups: List[Dict], - model: str, - tokenizer: Optional[str] = None, - tokenizer_mode: TokenizerMode = "auto", - skip_tokenizer_init: bool = False, - trust_remote_code: bool = False, - allowed_local_media_path: str = "", - tensor_parallel_size: int = 1, - dtype: ModelDType = "auto", - quantization: Optional[QuantizationMethods] = None, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - seed: Optional[int] = None, - gpu_memory_utilization: float = 0.9, - swap_space: float = 4, - cpu_offload_gb: float = 0, - enforce_eager: bool = False, - max_seq_len_to_capture: int = 8192, - disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = False, - hf_token: Optional[Union[bool, str]] = None, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - # After positional args are removed, move this right below `model` - task: TaskOption = "auto", - override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, dict[str, Any], CompilationConfig]] = None, - **kwargs, - ) -> None: - # setup envs for vllm - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - # torch.cuda may already init, explicitly disable expandable_segments - # here (only matters when VLLM_USE_RAY_SPMD_WORKER=0) - current_platform.set_allocator_settings("expandable_segments:False") - - os.environ["VLLM_CACHE_ROOT"] = os.path.join( - get_default_cache_root(), "vllm", os.environ.get("WORKER_NAME", "")) - - if "disable_log_stats" not in kwargs: - kwargs["disable_log_stats"] = True - - if "worker_cls" in kwargs: - worker_cls = kwargs["worker_cls"] - # if the worker_cls is not qualified string name, - # we serialize it using cloudpickle to avoid pickling issues - if isinstance(worker_cls, type): - kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) - - if hf_overrides is None: - hf_overrides = {} - - if compilation_config is not None: - if isinstance(compilation_config, int): - compilation_config_instance = CompilationConfig( - level=compilation_config) - elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) - compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) - else: - compilation_config_instance = compilation_config - else: - compilation_config_instance = CompilationConfig() - - kwargs["enable_sleep_mode"] = True - engine_args = EngineArgs( - model=model, - task=task, - tokenizer=tokenizer, - tokenizer_mode=tokenizer_mode, - skip_tokenizer_init=skip_tokenizer_init, - trust_remote_code=trust_remote_code, - allowed_local_media_path=allowed_local_media_path, - tensor_parallel_size=tensor_parallel_size, - dtype=dtype, - quantization=quantization, - revision=revision, - tokenizer_revision=tokenizer_revision, - seed=seed, - gpu_memory_utilization=gpu_memory_utilization, - swap_space=swap_space, - cpu_offload_gb=cpu_offload_gb, - enforce_eager=enforce_eager, - max_seq_len_to_capture=max_seq_len_to_capture, - disable_custom_all_reduce=disable_custom_all_reduce, - 
disable_async_output_proc=disable_async_output_proc, - hf_token=hf_token, - hf_overrides=hf_overrides, - mm_processor_kwargs=mm_processor_kwargs, - override_pooler_config=override_pooler_config, - compilation_config=compilation_config_instance, - **kwargs, - ) - engine_args.resource_placement_groups = resource_placement_groups - - # Create the Engine (autoselects V0 vs V1) - self.llm_engine = LLMEngine0100.from_engine_args( - engine_args=engine_args, usage_context=UsageContext.LLM_CLASS) - self.engine_class = type(self.llm_engine) - - self.request_counter = Counter() - self.default_sampling_params: Union[dict[str, Any], None] = None - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - def fetch_output(self): - # simulating non blocking semantic when using v1 engine - if envs.VLLM_USE_V1: - try: - request_outputs = self.llm_engine.step_nowait() - except queue.Empty: - request_outputs = [] - else: - request_outputs = self.llm_engine.step() - - return request_outputs - - def get_num_waiting(self): - stats = self.llm_engine._get_stats(scheduler_outputs=None) - return stats.num_waiting_sys - - def add_requests( - self, - prompt_token_ids: List[List[int]], - request_ids: List[int] | None, - sampling_params: SamplingParams, - multi_modal_data: List[int] | None, - lora_requests: List[LoRARequest] | None, - ): - assert len(prompt_token_ids) == len(request_ids) - if multi_modal_data: - assert len(multi_modal_data) == len(request_ids) - for i, (token_ids, request_id)in enumerate(zip(prompt_token_ids, request_ids)): - if request_id is None: - request_id = next(self.request_counter) - lora_request = lora_requests[i] if lora_requests is not None else None - if multi_modal_data: - # in v1, input_preprocessor is in engine.processor - processor = getattr(self.llm_engine, "processor", None) - input_preprocessor = processor.input_preprocessor if processor else self.llm_engine.input_preprocessor - preprocessed_inputs = input_preprocessor.preprocess( - prompt={"prompt_token_ids": token_ids, "multi_modal_data": multi_modal_data[i]}, - lora_request=lora_request, - ) - # in v1, engine does not use a input_processor - processed_inputs = ( - self.llm_engine.input_processor(preprocessed_inputs) - if hasattr(self.llm_engine, "input_processor") - else preprocessed_inputs - ) - else: - processed_inputs = { - "type": "token", - "prompt_token_ids": token_ids - } - self.llm_engine._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=sampling_params, - arrival_time=time.time(), - lora_request=lora_request, - ) - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.llm_engine.abort_request(request_id) - - def clear_unfinished_requests(self): - self._run_engine(use_tqdm=True) - - # 参数同步接口 - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - if envs.VLLM_USE_V1: - weight_dict = { - "dtype": 
weight.dtype, - "weight": weight.cpu().tolist() - } - self.collective_rpc(method="update_parameter", args=(parameter_name, weight_dict, ranks_in_worker, is_lora)) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - # vllm 084 does not support serialization of torch.Tensor(GPU), must use custom - # numpy array encoder or use pickle. - # Can not convert to numpy array here, because of bug in encoder/decoder of vllm 084. - # Newer version of vllm support efficient serilization of torch.Tensor. - buffer = buffer.cpu().tolist() - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_10_0/llm_engine.py b/roll/third_party/vllm/vllm_0_10_0/llm_engine.py deleted file mode 100644 index 92505ecf2..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/llm_engine.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Dict, Optional, Type - -from vllm import LLMEngine, EngineArgs, envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase - -import roll.third_party.vllm.fp8 as fp8 -from roll.utils.logging import get_logger - -logger = get_logger() - - -class LLMEngine0100(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = cls._get_executor_cls(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.ray_distributed_executor import ( - CustomRayDistributedExecutor as V0CustomRayDistributedExecutor) - executor_class = V0CustomRayDistributedExecutor - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls( - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - # Create the engine configs. 
- vllm_config = engine_args.create_engine_config(usage_context) - parallel_config = vllm_config.parallel_config - - resource_placement_groups = getattr(engine_args, "resource_placement_groups") - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - # change worker cls to custom - cls.update_worker_cls_config(vllm_config) - - fp8.update_quant_config(vllm_config) - - engine_cls = cls - if envs.VLLM_USE_V1: - from roll.third_party.vllm.vllm_0_10_0.v1.llm_engine import ( - LLMEngine0100 as V1LLMEngine0100) - engine_cls = V1LLMEngine0100 - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - @classmethod - def update_worker_cls_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config - - assert parallel_config.worker_cls != "auto" - if scheduler_config.is_multi_step: - pass - elif vllm_config.speculative_config: - pass - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_10_0.v1.worker.Worker0100" - else: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_10_0.worker.Worker0100" diff --git a/roll/third_party/vllm/vllm_0_10_0/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_10_0/ray_distributed_executor.py deleted file mode 100644 index 3c4cc1f8d..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/ray_distributed_executor.py +++ /dev/null @@ -1,265 +0,0 @@ -import asyncio -import os -from typing import Optional, List - -import cloudpickle -import msgspec - -import ray -from ray.runtime_env import RuntimeEnv -from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy -from vllm import envs -from vllm.executor.msgspec_utils import encode_hook -from vllm.executor.ray_distributed_executor import RayDistributedExecutor, RayWorkerMetaData -from vllm.executor.ray_utils import RayWorkerWrapper -from vllm.model_executor.layers.sampler import SamplerOutput -from vllm.platforms import current_platform -from vllm.ray.ray_env import get_env_vars_to_copy -from vllm.utils import make_async, get_ip, get_distributed_init_method, get_open_port - -from roll.utils.logging import get_logger -from roll.platforms import current_platform as roll_current_platform - -logger = get_logger() - -def initialize_ray_cluster(ray_address: Optional[str] = None): - if ray.is_initialized(): - return - ray.init(address=ray_address) - -class CustomRayDistributedExecutor(RayDistributedExecutor): - - def _init_executor(self) -> None: - self.forward_dag: Optional[ray.dag.CompiledDAG] = None - if envs.VLLM_USE_V1: - # V1 uses SPMD worker and compiled DAG - os.environ["VLLM_USE_RAY_SPMD_WORKER"] = "1" - os.environ["VLLM_USE_RAY_COMPILED_DAG"] = "1" - assert not current_platform.is_tpu() - - # If the env var is set, it uses the Ray's compiled DAG API - # which optimizes the control plane overhead. - # Run vLLM with VLLM_USE_RAY_COMPILED_DAG=1 to enable it. - # Currently, this requires USE_RAY_SPMD_WORKER=True. - self.use_ray_compiled_dag = envs.VLLM_USE_RAY_COMPILED_DAG - # If the env var is set, then we do not distinguish between the - # "driver worker" vs other workers. Also, the rank 0 worker will - # be executed in a remote Ray worker. Currently this requires - # USE_RAY_COMPILED_DAG=True. 
- self.use_ray_spmd_worker = envs.VLLM_USE_RAY_SPMD_WORKER - if self.use_ray_compiled_dag: - assert self.use_ray_spmd_worker, ( - "VLLM_USE_RAY_COMPILED_DAG=1 requires " - "VLLM_USE_RAY_SPMD_WORKER=1") - if self.use_ray_spmd_worker: - assert self.use_ray_compiled_dag, ( - "VLLM_USE_RAY_SPMD_WORKER=1 requires " - "VLLM_USE_RAY_COMPILED_DAG=1") - - placement_group = self.parallel_config.placement_group - assert self.uses_ray - assert len(placement_group) > 0 - initialize_ray_cluster(placement_group[0]['ray_address']) - assert ray.is_initialized() - - # Disable Ray usage stats collection. - ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") - if ray_usage != "1": - os.environ["RAY_USAGE_STATS_ENABLED"] = "0" - - # Create the parallel GPU workers. - self._init_workers_ray(placement_group) - - self.input_encoder = msgspec.msgpack.Encoder(enc_hook=encode_hook) - self.output_decoder = msgspec.msgpack.Decoder( - Optional[List[SamplerOutput]]) - self.use_v1 = envs.VLLM_USE_V1 - - self.pp_locks: Optional[List[asyncio.Lock]] = None - if not self.use_ray_compiled_dag: - self.driver_exec_method = make_async( - self.driver_worker.execute_method) - - def _init_workers_ray(self, placement_group: "PlacementGroup", - **ray_remote_kwargs): - assert len(placement_group) == self.parallel_config.world_size - - # The driver dummy worker does not actually use any resources. - # It holds the resource for the driver worker. - self.driver_dummy_worker: Optional[RayWorkerWrapper] = None - # The remaining workers are the actual ray actors. - self.workers: List[RayWorkerWrapper] = [] - - # Used in ray compiled DAG: indexed first by PP rank, - # and then TP rank. In other words, the inner list is - # the TP group of workers for a PP rank. - self.pp_tp_workers: List[List[RayWorkerWrapper]] = [] - - if self.parallel_config.ray_workers_use_nsight: - ray_remote_kwargs = self._configure_ray_workers_use_nsight( - ray_remote_kwargs) - - logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker) - - # Create the workers. - worker_metadata: List[RayWorkerMetaData] = [] - driver_ip = get_ip() - for rank in range(self.parallel_config.world_size): - pg = placement_group[rank]['placement_group'] - gpu_rank = placement_group[rank]['gpu_rank'] - env_vars = {} - env_vars.update(roll_current_platform.get_custom_env_vars()) - env_vars.update(roll_current_platform.get_vllm_run_time_env_vars(gpu_rank)) - runtime_env = RuntimeEnv(env_vars=env_vars) - assert current_platform.ray_device_key == "GPU" - # NV+AMD GPUs, and Intel XPUs - worker = ray.remote( - num_cpus=0, - num_gpus=0.01, - runtime_env=runtime_env, - scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg, ), - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rpc_rank=rank) - worker_metadata.append( - RayWorkerMetaData(worker=worker, created_rank=rank)) - - worker_ips = ray.get([ - each.worker.get_node_ip.remote() # type: ignore[attr-defined] - for each in worker_metadata - ]) - - for each, ip in zip(worker_metadata, worker_ips): - each.ip = ip - - if not self.use_ray_spmd_worker: - for i, each in enumerate(worker_metadata): - # find and remove the dummy worker from the list - worker = each.worker - worker_ip = each.ip - if self.driver_dummy_worker is None and worker_ip == driver_ip: - # If the worker is on the same node as the driver, we use it - # as the resource holder for the driver process. 
- self.driver_dummy_worker = worker - self.driver_worker = RayWorkerWrapper( - vllm_config=self.vllm_config, rpc_rank=0) - worker_metadata.pop(i) - break - - logger.debug("workers: %s", worker_metadata) - logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) - if not self.use_ray_spmd_worker and self.driver_dummy_worker is None: - raise ValueError( - "Ray does not allocate any GPUs on the driver node." - f"Driver IP: {driver_ip}, worker IPs: {worker_ips}." - "Consider adjusting the Ray placement group or running " - "the driver on a GPU node.") - - # 不需要sorted,按placement_group给定的资源顺序即可 - start_rank = 0 if self.use_ray_spmd_worker else 1 - for i, item in enumerate(worker_metadata): - item.adjusted_rank = i + start_rank - self.workers = [item.worker for item in worker_metadata] - rerank_mapping = { - item.created_rank: item.adjusted_rank - for item in worker_metadata - } - self._run_workers("adjust_rank", rerank_mapping) - - # Get the set of GPU IDs used on each node. - worker_node_and_gpu_ids = [] - for worker in [self.driver_dummy_worker] + self.workers: - if worker is None: - # driver_dummy_worker can be None when using ray spmd worker. - continue - worker_node_and_gpu_ids.append( - ray.get(worker.get_node_and_gpu_ids.remote()) \ - ) # type: ignore - - # Set environment variables for the driver and workers. - # 移除了device_control_env_var(CUDA_VISIBLE_DEVICES)设置,原因是我们只为每个worker分配了一个可见gpu - all_args_to_update_environment_variables = [{} for (node_id, _) in worker_node_and_gpu_ids] - # Environment variables to copy from driver to workers - env_vars_to_copy = get_env_vars_to_copy( - exclude_vars=self.WORKER_SPECIFIC_ENV_VARS, - additional_vars=set(current_platform.additional_env_vars).union( - self.ADDITIONAL_ENV_VARS), - destination="workers") - - # Copy existing env vars to each worker's args - for args in all_args_to_update_environment_variables: - for name in env_vars_to_copy: - if name in os.environ: - args[name] = os.environ[name] - - self._env_vars_for_all_workers = ( - all_args_to_update_environment_variables) - - self._run_workers("update_environment_variables", - self._get_env_vars_to_be_updated()) - - distributed_init_method = get_distributed_init_method( - driver_ip, get_open_port()) - - # Initialize the actual workers inside worker wrapper. - all_kwargs = [] - for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): - local_rank = 0 - kwargs = dict( - vllm_config=self.vllm_config, - local_rank=local_rank, - rank=rank, - distributed_init_method=distributed_init_method, - is_driver_worker=(not self.parallel_config) - or (rank % self.parallel_config.tensor_parallel_size == 0), - ) - all_kwargs.append(kwargs) - self._run_workers("init_worker", all_kwargs) - - self._run_workers("init_device") - self._run_workers("load_model", - max_concurrent_workers=self.parallel_config. - max_parallel_loading_workers) - - if self.use_ray_spmd_worker: - for pp_rank in range(self.parallel_config.pipeline_parallel_size): - self.pp_tp_workers.append([]) - for tp_rank in range( - self.parallel_config.tensor_parallel_size): - # PP=2, TP=4 - # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] - rank = (pp_rank * self.parallel_config.tensor_parallel_size - ) + tp_rank - assert len(self.pp_tp_workers[pp_rank]) == tp_rank - assert pp_rank < len(self.pp_tp_workers) - self.pp_tp_workers[pp_rank].append(self.workers[rank]) - - # This is the list of workers that are rank 0 of each TP group EXCEPT - # global rank 0. These are the workers that will broadcast to the - # rest of the workers. 
- self.tp_driver_workers: List[RayWorkerWrapper] = [] - # This is the list of workers that are not drivers and not the first - # worker in a TP group. These are the workers that will be - # broadcasted to. - self.non_driver_workers: List[RayWorkerWrapper] = [] - - # Enforce rank order for correct rank to return final output. - for index, worker in enumerate(self.workers): - # The driver worker is rank 0 and not in self.workers. - rank = index + 1 - if rank % self.parallel_config.tensor_parallel_size == 0: - self.tp_driver_workers.append(worker) - else: - self.non_driver_workers.append(worker) - - def shutdown(self) -> None: - logger.info( - "Shutting down Ray distributed executor. If you see error log " - "from logging.cc regarding SIGTERM received, please ignore because " - "this is the expected termination process in Ray.") - if hasattr(self, "forward_dag") and self.forward_dag is not None: - self.forward_dag.teardown() - import ray - for worker in self.workers: - ray.kill(worker) - self.forward_dag = None diff --git a/roll/third_party/vllm/vllm_0_10_0/v1/async_llm.py b/roll/third_party/vllm/vllm_0_10_0/v1/async_llm.py deleted file mode 100644 index 67a4025e8..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/v1/async_llm.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import asyncio -from typing import (Tuple, List, Dict, Optional, Union, Any, - Callable, Dict, List, Optional) - -from vllm import envs -from vllm.v1.engine.async_llm import AsyncLLM -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.llm import _R -from vllm.usage.usage_lib import UsageContext -from vllm.v1.executor.abstract import Executor - -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import SendBucketManager - -logger = get_logger() - -class AsyncLLM0100(AsyncLLM): - - def __init__(self, resource_placement_groups, **kwargs): - assert envs.VLLM_USE_V1 - - engine_args = AsyncEngineArgs( - **kwargs, - ) - engine_args.enable_sleep_mode = True - vllm_config = engine_args.create_engine_config(UsageContext.ENGINE_CONTEXT) - - parallel_config = vllm_config.parallel_config - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - assert not vllm_config.scheduler_config.is_multi_step - assert not vllm_config.speculative_config - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_10_0.v1.worker.Worker0100" - - executor_class = Executor.get_class(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - - # Default fork method is not compatible with ScaleAligner. 
- os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using AsyncLLM") - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return super().__init__( - vllm_config=vllm_config, - executor_class=executor_class, - start_engine_loop=True, - log_requests=True, - log_stats=True, - usage_context=UsageContext.ENGINE_CONTEXT, - ) - - def collective_rpc(self, - method: Union[str, Callable[..., _R]], - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(self.engine_core.collective_rpc_async(method, timeout, args, kwargs)) - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - # 参数同步接口 - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, *args, **kwargs): - self.collective_rpc(method="update_parameter", args=args, kwargs=kwargs) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_10_0/v1/llm_engine.py b/roll/third_party/vllm/vllm_0_10_0/v1/llm_engine.py deleted file mode 100644 index cb36342f5..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/v1/llm_engine.py +++ /dev/null @@ -1,241 +0,0 @@ -import os -from collections.abc import Mapping, Sequence -from copy import copy -from typing import Optional, Union - -from vllm import envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase -from vllm.v1.engine.processor import Processor -from vllm.config import VllmConfig -from vllm.inputs import ProcessorInputs -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.outputs import RequestOutput -from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import merge_and_sort_multimodal_metadata -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.core_client import SyncMPClient -from vllm.v1.executor.abstract import Executor -from vllm.v1.engine.llm_engine import LLMEngine -from vllm.v1.engine.parallel_sampling import ParentRequest -from roll.utils.logging import get_logger - -logger = get_logger() - -def custom_process_inputs( - self, - request_id: str, - prompt: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: 
Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, -) -> EngineCoreRequest: - - self._validate_lora(lora_request) - self._validate_params(params,lora_request) - if priority != 0: - raise ValueError("V1 does not support priority yet.") - if trace_headers is not None: - raise ValueError("V1 does not support tracing yet.") - - assert arrival_time is not None - - processed_inputs: ProcessorInputs = prompt - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - self._validate_model_inputs(processed_inputs, lora_request) - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - if encoder_inputs is not None: - raise NotImplementedError - - assert isinstance(params, SamplingParams) - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. - if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) - pooling_params = None - - # Multimodal related. - sorted_mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] = None - sorted_mm_positions: Optional[list[PlaceholderRange]] = None - sorted_mm_hashes: Optional[list[str]] = None - if decoder_inputs["type"] == "multimodal": - decoder_mm_inputs = decoder_inputs["mm_kwargs"] - - # Merge and flatten multimodal placeholders, hashes and inputs - # from dictionaries to lists, and sort them by each item's position - # in the input sequence. - ( - sorted_item_modalities, - sorted_mm_positions, - sorted_mm_hashes, - ) = merge_and_sort_multimodal_metadata( - decoder_inputs["mm_placeholders"], - decoder_inputs["mm_hashes"] if self.use_hash else None, - ) - - # The output of merged multi-modal processor (`decoder_mm_inputs`) - # is a single MultiModalKwargs for all items from all modalities. - # This code flattens kwargs for individual items in a list and - # sorts them by each item's position in the input sequence if there - # are multiple modalities. 
- unique_modalities = set(sorted_item_modalities) - if len(unique_modalities) > 1: - orig_sorted_mm_inputs = [] - used_indices = {modality: 0 for modality in unique_modalities} - - for modality in sorted_item_modalities: - items = decoder_mm_inputs.get_items(modality) - item = items[used_indices[modality]] - - orig_sorted_mm_inputs.append( - MultiModalKwargs.from_items([item])) - used_indices[modality] += 1 - else: - orig_sorted_mm_inputs = [ - MultiModalKwargs.from_items([item]) for item in - decoder_mm_inputs.get_items(sorted_item_modalities[0]) - ] - - if sorted_mm_hashes is not None: - sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0( - orig_sorted_mm_inputs, sorted_mm_hashes) - else: - sorted_mm_inputs = orig_sorted_mm_inputs - - return decoder_inputs.get("prompt"),EngineCoreRequest( - request_id=request_id, - prompt_token_ids=decoder_inputs["prompt_token_ids"], - mm_inputs=sorted_mm_inputs, - mm_hashes=sorted_mm_hashes, - mm_placeholders=sorted_mm_positions, - sampling_params=sampling_params, - pooling_params=pooling_params, - eos_token_id=eos_token_id, - arrival_time=arrival_time, - lora_request=lora_request, - cache_salt=None, - data_parallel_rank=None, - ) - -Processor.custom_process_inputs = custom_process_inputs - -def get_output_nowait(self) -> EngineCoreOutputs: - """ - Only get an item if one is immediately available. Otherwise - raise the queue.Empty exception. - """ - return self.outputs_queue.get_nowait() - -# Function 'step' of vllm v1 and v0 engine has different semantic. -# Function vllm.v1.engine.LLMEngine.step is blocking but that of v0 is not. -# This will cause deadlock when calling roll.third_party.vllm.vllm_0_8_4.Llm084.fetch_output -# inside VllmStrategy if set generate_opt_level to 1. -SyncMPClient.get_output_nowait = get_output_nowait - -class LLMEngine0100(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - if stat_loggers is not None: - raise NotImplementedError( - "Passing StatLoggers to V1 is not yet supported. " - "Set VLLM_USE_V1=0 and file and issue on Github.") - - parallel_config = vllm_config.parallel_config - - executor_class = Executor.get_class(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # Default fork method is not compatible with ScaleAligner. 
- os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - prompt_str, request = self.processor.custom_process_inputs(request_id, processed_inputs, params, - arrival_time, lora_request, - trace_headers, - priority) - - n = params.n if isinstance(params, SamplingParams) else 1 - - if n == 1: - # Make a new RequestState and queue. - self.output_processor.add_request(request, prompt_str, None, 0) - # Add the request to EngineCore. - self.engine_core.add_request(request) - return - - # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) - for idx in range(n): - request_id, params = parent_req.get_child_info(idx) - child_request = request if idx == n - 1 else copy(request) - child_request.request_id = request_id - child_request.sampling_params = params - - # Make a new RequestState and queue. - self.output_processor.add_request(child_request,prompt_str, parent_req, idx) - # Add the request to EngineCore. - self.engine_core.add_request(child_request) - - def step_nowait(self) -> list[RequestOutput]: - - if self.should_execute_dummy_batch: - self.should_execute_dummy_batch = False - self.engine_core.execute_dummy_batch() - return [] - - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output_nowait() - - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs.outputs) - - # 3) Abort any reqs that finished due to stop strings. - self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - - return processed_outputs.request_outputs diff --git a/roll/third_party/vllm/vllm_0_10_0/v1/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_10_0/v1/ray_distributed_executor.py deleted file mode 100644 index 9897230c3..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/v1/ray_distributed_executor.py +++ /dev/null @@ -1,9 +0,0 @@ -from vllm.v1.executor.ray_distributed_executor import RayDistributedExecutor - -from roll.third_party.vllm.vllm_0_10_0.ray_distributed_executor import ( - CustomRayDistributedExecutor as CustomRayDistributedExecutorV0) - -# Force RayDistributedExecutor to come before CustomRayDistributedExecutorV0 -# to ensure correct method resolution order (MRO) and override behavior. 
-class CustomRayDistributedExecutor(RayDistributedExecutor, CustomRayDistributedExecutorV0): - pass diff --git a/roll/third_party/vllm/vllm_0_10_0/v1/worker.py b/roll/third_party/vllm/vllm_0_10_0/v1/worker.py deleted file mode 100644 index f65f07430..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/v1/worker.py +++ /dev/null @@ -1,51 +0,0 @@ -import gc -import time -from collections import OrderedDict -from typing import Optional - -import torch -from vllm.device_allocator.cumem import CuMemAllocator -from vllm.v1.worker.gpu_worker import Worker - -from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import RecvBucketManager -from roll.platforms import current_platform - -logger = get_logger() - - -class Worker0100(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.lora_params = OrderedDict() - patch_vllm_lora_manager() - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - weight_dict = weight - weight = torch.tensor(weight_dict["weight"], dtype=weight_dict["dtype"]).to(current_platform.device_type) - super().update_parameter(parameter_name, weight, ranks_in_worker, is_lora) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - RecvBucketManager.dict_to_meta(meta_infos) - super().broadcast_bucket(src_pp_rank, meta_infos, bucket_size) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - RecvBucketManager.dict_to_meta(meta_infos) - buffer = torch.tensor(buffer, dtype=torch.int8, device=current_platform.device_type) - super().update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) - - def add_lora(self, peft_config) -> bool: - lora_int_id = int(time.time_ns() % 0x7FFFFFFF) - lora_request = TensorLoRARequest( - lora_name=f"{lora_int_id}", - lora_int_id=lora_int_id, - lora_path="dummy_lora_path", - peft_config=peft_config, - lora_tensors=self.lora_params, - ) - del self.lora_params - self.lora_params = OrderedDict() - super().reload_model() - return self.model_runner.add_lora(lora_request) diff --git a/roll/third_party/vllm/vllm_0_10_0/worker.py b/roll/third_party/vllm/vllm_0_10_0/worker.py deleted file mode 100644 index 41217a532..000000000 --- a/roll/third_party/vllm/vllm_0_10_0/worker.py +++ /dev/null @@ -1,15 +0,0 @@ -import gc -from typing import Optional - -import torch -from vllm.worker.worker import Worker - -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger - -logger = get_logger() - - -class Worker0100(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) diff --git a/roll/third_party/vllm/vllm_0_10_2/llm.py b/roll/third_party/vllm/vllm_0_10_2/llm.py deleted file mode 100644 index fe38a85ad..000000000 --- a/roll/third_party/vllm/vllm_0_10_2/llm.py +++ /dev/null @@ -1,285 +0,0 @@ -import os -import queue -import time -from typing import Any, Dict, Iterable, List, Optional, Union - -import cloudpickle -import torch -from vllm import LLM, EngineArgs, SamplingParams, envs -from vllm.config import (CompilationConfig, ModelDType, TokenizerMode, - is_init_field) -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, - PoolerConfig, RunnerOption) -from vllm.v1.sample.logits_processor import 
LogitsProcessor -from vllm.entrypoints.utils import (_validate_truncation_size, - log_non_default_args) -from vllm.lora.request import LoRARequest -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter -from vllm.plugins.io_processors import get_io_processor -from vllm.envs import get_default_cache_root - -from roll.third_party.vllm.vllm_0_10_2.llm_engine import LLMEngine0102 -from roll.utils.send_recv_utils import SendBucketManager -from roll.utils.logging import get_logger -from roll.platforms import current_platform - -logger = get_logger() - -class Llm0102(LLM): - - def __init__( - self, - resource_placement_groups: List[Dict], - model: str, - *, - runner: RunnerOption = "auto", - convert: ConvertOption = "auto", - tokenizer: Optional[str] = None, - tokenizer_mode: TokenizerMode = "auto", - skip_tokenizer_init: bool = False, - trust_remote_code: bool = False, - allowed_local_media_path: str = "", - tensor_parallel_size: int = 1, - dtype: ModelDType = "auto", - quantization: Optional[QuantizationMethods] = None, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - seed: Optional[int] = None, - gpu_memory_utilization: float = 0.9, - swap_space: float = 4, - cpu_offload_gb: float = 0, - enforce_eager: bool = False, - max_seq_len_to_capture: int = 8192, - disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = False, - hf_token: Optional[Union[bool, str]] = None, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - override_pooler_config: Optional[PoolerConfig] = None, - kv_cache_memory_bytes: Optional[int] = None, - compilation_config: Optional[Union[int, dict[str, Any], - CompilationConfig]] = None, - logits_processors: Optional[list[Union[str, - type[LogitsProcessor]]]] = None, - **kwargs: Any, - ) -> None: - """LLM constructor.""" - # setup envs for vllm - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - # torch.cuda may already init, explicitly disable expandable_segments - # here (only matters when VLLM_USE_RAY_SPMD_WORKER=0) - current_platform.set_allocator_settings("expandable_segments:False") - - os.environ["VLLM_CACHE_ROOT"] = os.path.join( - get_default_cache_root(), "vllm", os.environ.get("WORKER_NAME", "")) - - if "disable_log_stats" not in kwargs: - kwargs["disable_log_stats"] = True - - if "worker_cls" in kwargs: - worker_cls = kwargs["worker_cls"] - # if the worker_cls is not qualified string name, - # we serialize it using cloudpickle to avoid pickling issues - if isinstance(worker_cls, type): - kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) - - if "kv_transfer_config" in kwargs and isinstance( - kwargs["kv_transfer_config"], dict): - from vllm.config.kv_transfer import KVTransferConfig - raw_config_dict = kwargs["kv_transfer_config"] - try: - kwargs["kv_transfer_config"] = KVTransferConfig( - **raw_config_dict) - except ValidationError as e: - logger.error( - "Failed to convert 'kv_transfer_config' dict to " - "KVTransferConfig object. Dict: %s. Error: %s", - raw_config_dict, e) - # Consider re-raising a more specific vLLM error or ValueError - # to provide better context to the user. 
- raise ValueError( - f"Invalid 'kv_transfer_config' provided: {e}") from e - - if hf_overrides is None: - hf_overrides = {} - - if compilation_config is not None: - if isinstance(compilation_config, int): - compilation_config_instance = CompilationConfig( - level=compilation_config) - elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) - compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) - else: - compilation_config_instance = compilation_config - else: - compilation_config_instance = CompilationConfig() - - kwargs["enable_sleep_mode"] = True - engine_args = EngineArgs( - model=model, - runner=runner, - convert=convert, - tokenizer=tokenizer, - tokenizer_mode=tokenizer_mode, - skip_tokenizer_init=skip_tokenizer_init, - trust_remote_code=trust_remote_code, - allowed_local_media_path=allowed_local_media_path, - tensor_parallel_size=tensor_parallel_size, - dtype=dtype, - quantization=quantization, - revision=revision, - tokenizer_revision=tokenizer_revision, - seed=seed, - gpu_memory_utilization=gpu_memory_utilization, - kv_cache_memory_bytes=kv_cache_memory_bytes, - swap_space=swap_space, - cpu_offload_gb=cpu_offload_gb, - enforce_eager=enforce_eager, - max_seq_len_to_capture=max_seq_len_to_capture, - disable_custom_all_reduce=disable_custom_all_reduce, - disable_async_output_proc=disable_async_output_proc, - hf_token=hf_token, - hf_overrides=hf_overrides, - mm_processor_kwargs=mm_processor_kwargs, - override_pooler_config=override_pooler_config, - compilation_config=compilation_config_instance, - logits_processors=logits_processors, - **kwargs, - ) - engine_args.resource_placement_groups = resource_placement_groups - - log_non_default_args(engine_args) - - # Create the Engine (autoselects V0 vs V1) - self.llm_engine = LLMEngine0102.from_engine_args( - engine_args=engine_args, usage_context=UsageContext.LLM_CLASS) - self.engine_class = type(self.llm_engine) - - self.request_counter = Counter() - self.default_sampling_params: Union[dict[str, Any], None] = None - - if envs.VLLM_USE_V1: - supported_tasks = self.llm_engine \ - .get_supported_tasks() # type: ignore - else: - supported_tasks = self.llm_engine.model_config.supported_tasks - - logger.info("Supported_tasks: %s", supported_tasks) - - self.supported_tasks = supported_tasks - - # Load the Input/Output processor plugin if any - io_processor_plugin = self.llm_engine.model_config.io_processor_plugin - self.io_processor = get_io_processor(self.llm_engine.vllm_config, - io_processor_plugin) - - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - def fetch_output(self): - # simulating non blocking semantic when using v1 engine - if envs.VLLM_USE_V1: - try: - request_outputs = self.llm_engine.step_nowait() - except queue.Empty: - request_outputs = [] - else: - request_outputs = self.llm_engine.step() - - return request_outputs - - def get_num_waiting(self): - stats = self.llm_engine._get_stats(scheduler_outputs=None) - return stats.num_waiting_sys - - def add_requests( - self, - prompt_token_ids: List[List[int]], - request_ids: List[int] | None, - sampling_params: SamplingParams, - multi_modal_data: List[int] | None, - lora_requests: List[LoRARequest] | None, - ): - assert len(prompt_token_ids) == len(request_ids) - if multi_modal_data: - assert len(multi_modal_data) == len(request_ids) - 
for i, (token_ids, request_id) in enumerate(zip(prompt_token_ids, request_ids)): - if request_id is None: - request_id = next(self.request_counter) - lora_request = lora_requests[i] if lora_requests is not None else None - if multi_modal_data: - # in v1, input_preprocessor is in engine.processor - processor = getattr(self.llm_engine, "processor", None) - input_preprocessor = processor.input_preprocessor if processor else self.llm_engine.input_preprocessor - preprocessed_inputs = input_preprocessor.preprocess( - prompt={"prompt_token_ids": token_ids, "multi_modal_data": multi_modal_data[i]}, - lora_request=lora_request, - ) - # in v1, engine does not use an input_processor - processed_inputs = ( - self.llm_engine.input_processor(preprocessed_inputs) - if hasattr(self.llm_engine, "input_processor") - else preprocessed_inputs - ) - else: - processed_inputs = { - "type": "token", - "prompt_token_ids": token_ids - } - self.llm_engine._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=sampling_params, - arrival_time=time.time(), - lora_request=lora_request, - ) - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.llm_engine.abort_request(request_id) - - def clear_unfinished_requests(self): - self._run_engine(use_tqdm=True) - - # Parameter synchronization interfaces - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - if envs.VLLM_USE_V1: - weight_dict = { - "dtype": weight.dtype, - "weight": weight.cpu().tolist() - } - self.collective_rpc(method="update_parameter", args=(parameter_name, weight_dict, ranks_in_worker, is_lora)) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - # vllm 084 does not support serialization of torch.Tensor(GPU), must use a custom - # numpy array encoder or use pickle. - # Cannot convert to a numpy array here, because of a bug in the encoder/decoder of vllm 084. - # Newer versions of vllm support efficient serialization of torch.Tensor. 
- buffer = buffer.cpu().tolist() - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_10_2/llm_engine.py b/roll/third_party/vllm/vllm_0_10_2/llm_engine.py deleted file mode 100644 index 52e6f1e0a..000000000 --- a/roll/third_party/vllm/vllm_0_10_2/llm_engine.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Dict, Optional, Type - -from vllm import LLMEngine, EngineArgs, envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase - -import roll.third_party.vllm.fp8 as fp8 -from roll.utils.logging import get_logger - -logger = get_logger() - - -class LLMEngine0102(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = cls._get_executor_cls(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.ray_distributed_executor import ( - CustomRayDistributedExecutor as V0CustomRayDistributedExecutor) - executor_class = V0CustomRayDistributedExecutor - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls( - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. 
- vllm_config = engine_args.create_engine_config(usage_context) - parallel_config = vllm_config.parallel_config - - resource_placement_groups = getattr(engine_args, "resource_placement_groups") - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - # change worker cls to custom - cls.update_worker_cls_config(vllm_config) - - fp8.update_quant_config(vllm_config) - - engine_cls = cls - if envs.VLLM_USE_V1: - from roll.third_party.vllm.vllm_0_10_2.v1.llm_engine import ( - LLMEngine0102 as V1LLMEngine0102) - engine_cls = V1LLMEngine0102 - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - @classmethod - def update_worker_cls_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - - assert parallel_config.worker_cls != "auto" - if vllm_config.speculative_config: - pass - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_10_2.v1.worker.Worker0102" - else: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_10_2.worker.Worker0102" diff --git a/roll/third_party/vllm/vllm_0_10_2/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_10_2/ray_distributed_executor.py index 0b6131969..ec00fbb29 100644 --- a/roll/third_party/vllm/vllm_0_10_2/ray_distributed_executor.py +++ b/roll/third_party/vllm/vllm_0_10_2/ray_distributed_executor.py @@ -110,6 +110,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", env_vars = {} env_vars.update(roll_current_platform.get_custom_env_vars()) env_vars.update(roll_current_platform.get_vllm_run_time_env_vars(gpu_rank)) + env_vars["FLASHINFER_WORKSPACE_BASE"] = f"{os.environ['FLASHINFER_WORKSPACE_BASE']}_{rank}" runtime_env = RuntimeEnv(env_vars=env_vars) assert current_platform.ray_device_key == "GPU" # NV+AMD GPUs, and Intel XPUs diff --git a/roll/third_party/vllm/vllm_0_10_2/v1/llm_engine.py b/roll/third_party/vllm/vllm_0_10_2/v1/llm_engine.py deleted file mode 100644 index 8b72e6522..000000000 --- a/roll/third_party/vllm/vllm_0_10_2/v1/llm_engine.py +++ /dev/null @@ -1,235 +0,0 @@ -import os -import time -from collections.abc import Mapping, Sequence -from copy import copy -from typing import Any, Optional, Union - -from vllm import envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.v1.metrics.loggers import (PrometheusStatLogger, StatLoggerBase, - StatLoggerFactory) -from vllm.v1.engine.processor import Processor -from vllm.config import VllmConfig -from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.multimodal.inputs import MultiModalFeatureSpec, MultiModalUUIDDict -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.core_client import SyncMPClient -from vllm.v1.executor.abstract import Executor -from vllm.v1.engine.llm_engine import LLMEngine -from vllm.v1.engine.parallel_sampling import ParentRequest -from roll.utils.logging import get_logger - -logger = get_logger() - 
-def custom_process_inputs( - self, - request_id: str, - prompt: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, -) -> tuple[Optional[str], EngineCoreRequest]: - - # TODO(woosuk): Support pooling models. - self._validate_lora(lora_request) - self._validate_params(params, lora_request) - - data_parallel_size = self.vllm_config.parallel_config.data_parallel_size - if data_parallel_rank is not None and not (0 <= data_parallel_rank < - data_parallel_size): - raise ValueError(f"data_parallel_rank {data_parallel_rank} " - f"is out of range [0, {data_parallel_size}).") - - assert arrival_time is not None - - processed_inputs: ProcessorInputs = prompt - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - self._validate_model_inputs(processed_inputs, lora_request) - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - sampling_params = None - pooling_params = None - if isinstance(params, SamplingParams): - # TODO: can we avoid cloning here in multiproc case? - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. - if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - if self.tokenizer is not None: - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) - else: - pooling_params = params.clone() - - # Multimodal related. - mm_features: Optional[list[MultiModalFeatureSpec]] = None - - if decoder_inputs["type"] == "multimodal": - decoder_mm_inputs = decoder_inputs["mm_kwargs"] - decoder_mm_positions = decoder_inputs["mm_placeholders"] - decoder_mm_hashes = decoder_inputs["mm_hashes"] - - # Merge and flatten multimodal placeholders, hashes and inputs - # from dictionaries to lists, and sort them by each item's position - # in the input sequence. - sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - - mm_features = [] - for modality, idx in sorted_mm_idxs: - mm_features.append( - MultiModalFeatureSpec( - data=decoder_mm_inputs[modality][idx], - modality=modality, - identifier=decoder_mm_hashes[modality][idx], - mm_position=decoder_mm_positions[modality][idx])) - - return decoder_inputs.get("prompt"), EngineCoreRequest( - request_id=request_id, - prompt_token_ids=decoder_inputs["prompt_token_ids"], - mm_features=mm_features, - sampling_params=sampling_params, - pooling_params=pooling_params, - eos_token_id=eos_token_id, - arrival_time=arrival_time, - lora_request=lora_request, - cache_salt=decoder_inputs.get("cache_salt"), - priority=priority, - data_parallel_rank=data_parallel_rank, - trace_headers=trace_headers, - ) - -Processor.custom_process_inputs = custom_process_inputs - -def get_output_nowait(self) -> EngineCoreOutputs: - """ - Only get an item if one is immediately available. Otherwise - raise the queue.Empty exception. - """ - # If an exception arises in process_outputs_socket task, - # it is forwarded to the outputs_queue so we can raise it - # from this (run_output_handler) task to shut down the server. 
- outputs = self.outputs_queue.get_nowait() - if isinstance(outputs, Exception): - raise self._format_exception(outputs) from None - if outputs.wave_complete is not None: - self.engines_running = False - return outputs - -# Function 'step' of vllm v1 and v0 engine has different semantic. -# Function vllm.v1.engine.LLMEngine.step is blocking but that of v0 is not. -# This will cause deadlock when calling roll.third_party.vllm.vllm_0_8_4.Llm084.fetch_output -# inside VllmStrategy if set generate_opt_level to 1. -SyncMPClient.get_output_nowait = get_output_nowait - -class LLMEngine0102(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[list[StatLoggerFactory]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = Executor.get_class(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # Default fork method is not compatible with ScaleAligner. - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - prompt_str, request = self.processor.custom_process_inputs(request_id, processed_inputs, params, - arrival_time, lora_request, - trace_headers, - priority) - - n = params.n if isinstance(params, SamplingParams) else 1 - - if n == 1: - # Make a new RequestState and queue. - self.output_processor.add_request(request, prompt_str, None, 0) - # Add the request to EngineCore. - self.engine_core.add_request(request) - return - - # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) - for idx in range(n): - request_id, params = parent_req.get_child_info(idx) - child_request = request if idx == n - 1 else copy(request) - child_request.request_id = request_id - child_request.sampling_params = params - - # Make a new RequestState and queue. - self.output_processor.add_request(child_request,prompt_str, parent_req, idx) - # Add the request to EngineCore. - self.engine_core.add_request(child_request) - - def step_nowait(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]: - - if self.should_execute_dummy_batch: - self.should_execute_dummy_batch = False - self.engine_core.execute_dummy_batch() - return [] - - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output_nowait() - - # 2) Process EngineCoreOutputs. - iteration_stats = IterationStats() if self.log_stats else None - processed_outputs = self.output_processor.process_outputs( - outputs.outputs, - engine_core_timestamp=outputs.timestamp, - iteration_stats=iteration_stats) - - # 3) Abort any reqs that finished due to stop strings. 
- self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - - # 4) Record stats - if self.stat_logger is not None: - assert outputs.scheduler_stats is not None - self.stat_logger.record(scheduler_stats=outputs.scheduler_stats, - iteration_stats=iteration_stats) - - return processed_outputs.request_outputs diff --git a/roll/third_party/vllm/vllm_0_10_2/v1/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_10_2/v1/ray_distributed_executor.py index 9897230c3..60da6dd0c 100644 --- a/roll/third_party/vllm/vllm_0_10_2/v1/ray_distributed_executor.py +++ b/roll/third_party/vllm/vllm_0_10_2/v1/ray_distributed_executor.py @@ -1,6 +1,6 @@ from vllm.v1.executor.ray_distributed_executor import RayDistributedExecutor -from roll.third_party.vllm.vllm_0_10_0.ray_distributed_executor import ( +from roll.third_party.vllm.vllm_0_10_2.ray_distributed_executor import ( CustomRayDistributedExecutor as CustomRayDistributedExecutorV0) # Force RayDistributedExecutor to come before CustomRayDistributedExecutorV0 diff --git a/roll/third_party/vllm/vllm_0_10_2/v1/worker.py b/roll/third_party/vllm/vllm_0_10_2/v1/worker.py deleted file mode 100644 index 3b7a467cc..000000000 --- a/roll/third_party/vllm/vllm_0_10_2/v1/worker.py +++ /dev/null @@ -1,52 +0,0 @@ -import gc -import time -from collections import OrderedDict -from typing import Optional - -import torch -from vllm.device_allocator.cumem import CuMemAllocator -from vllm.v1.worker.gpu_worker import Worker - -from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import RecvBucketManager -from roll.platforms import current_platform - - -logger = get_logger() - - -class Worker0102(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.lora_params = OrderedDict() - patch_vllm_lora_manager() - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - weight_dict = weight - weight = torch.tensor(weight_dict["weight"], dtype=weight_dict["dtype"]).to(current_platform.device_type) - super().update_parameter(parameter_name, weight, ranks_in_worker, is_lora) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - RecvBucketManager.dict_to_meta(meta_infos) - super().broadcast_bucket(src_pp_rank, meta_infos, bucket_size) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - RecvBucketManager.dict_to_meta(meta_infos) - buffer = torch.tensor(buffer, dtype=torch.int8, device=current_platform.device_type) - super().update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) - - def add_lora(self, peft_config) -> bool: - lora_int_id = int(time.time_ns() % 0x7FFFFFFF) - lora_request = TensorLoRARequest( - lora_name=f"{lora_int_id}", - lora_int_id=lora_int_id, - lora_path="dummy_lora_path", - peft_config=peft_config, - lora_tensors=self.lora_params, - ) - del self.lora_params - self.lora_params = OrderedDict() - super().reload_model() - return self.model_runner.add_lora(lora_request) diff --git a/roll/third_party/vllm/vllm_0_10_2/worker.py b/roll/third_party/vllm/vllm_0_10_2/worker.py deleted file mode 100644 index aa4e52155..000000000 --- a/roll/third_party/vllm/vllm_0_10_2/worker.py +++ /dev/null @@ -1,15 +0,0 @@ -import gc -from typing import Optional - -import torch -from vllm.worker.worker import Worker - -from roll.third_party.vllm.worker_helper import WorkerHelper -from 
roll.utils.logging import get_logger - -logger = get_logger() - - -class Worker0102(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) diff --git a/roll/third_party/vllm/vllm_0_11_0/llm.py b/roll/third_party/vllm/vllm_0_11_0/llm.py deleted file mode 100644 index e9a117880..000000000 --- a/roll/third_party/vllm/vllm_0_11_0/llm.py +++ /dev/null @@ -1,307 +0,0 @@ -import os -import queue -import time -from typing import Any, Dict, Iterable, List, Optional, Union - -import cloudpickle -import torch -from pydantic import ValidationError -from vllm import LLM, EngineArgs, SamplingParams, envs -from vllm.config import ( - CompilationConfig, - StructuredOutputsConfig, - is_init_field, -) - -try: - # 0.11.1rc2.dev0+gc3a722fcb.d20251021 has import diff - from vllm.config.model import ModelDType, TokenizerMode -except ImportError: - from vllm.config import ModelDType, TokenizerMode - -from vllm.model_executor.layers.quantization import QuantizationMethods -from vllm.engine.arg_utils import (ConvertOption, EngineArgs, HfOverrides, - PoolerConfig, RunnerOption) -from vllm.v1.sample.logits_processor import LogitsProcessor -from vllm.entrypoints.utils import (log_non_default_args) -from vllm.lora.request import LoRARequest -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter -from vllm.plugins.io_processors import get_io_processor -from vllm.envs import get_default_cache_root - -from roll.third_party.vllm.vllm_0_11_0.llm_engine import LLMEngine0110 -from roll.utils.send_recv_utils import SendBucketManager -from roll.utils.logging import get_logger -from roll.platforms import current_platform - -logger = get_logger() - -class Llm0110(LLM): - - def __init__( - self, - resource_placement_groups: List[Dict], - model: str, - *, - runner: RunnerOption = "auto", - convert: ConvertOption = "auto", - tokenizer: Optional[str] = None, - tokenizer_mode: TokenizerMode = "auto", - skip_tokenizer_init: bool = False, - trust_remote_code: bool = False, - allowed_local_media_path: str = "", - allowed_media_domains: Optional[list[str]] = None, - tensor_parallel_size: int = 1, - dtype: ModelDType = "auto", - quantization: Optional[QuantizationMethods] = None, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - seed: Optional[int] = None, - gpu_memory_utilization: float = 0.9, - swap_space: float = 4, - cpu_offload_gb: float = 0, - enforce_eager: bool = False, - disable_custom_all_reduce: bool = False, - hf_token: Optional[Union[bool, str]] = None, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - pooler_config: Optional[PoolerConfig] = None, - override_pooler_config: Optional[PoolerConfig] = None, - structured_outputs_config: Optional[Union[dict[ - str, Any], StructuredOutputsConfig]] = None, - kv_cache_memory_bytes: Optional[int] = None, - compilation_config: Optional[Union[int, dict[str, Any], - CompilationConfig]] = None, - logits_processors: Optional[list[Union[str, - type[LogitsProcessor]]]] = None, - **kwargs: Any, - ) -> None: - """LLM constructor.""" - # setup envs for vllm - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - # torch.cuda may already init, explicitly disable expandable_segments - # here (only matters when VLLM_USE_RAY_SPMD_WORKER=0) - current_platform.set_allocator_settings("expandable_segments:False") - - 
os.environ["VLLM_CACHE_ROOT"] = os.path.join( - get_default_cache_root(), "vllm", os.environ.get("WORKER_NAME", "")) - - if "disable_log_stats" not in kwargs: - kwargs["disable_log_stats"] = True - - if "worker_cls" in kwargs: - worker_cls = kwargs["worker_cls"] - # if the worker_cls is not qualified string name, - # we serialize it using cloudpickle to avoid pickling issues - if isinstance(worker_cls, type): - kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) - - if "kv_transfer_config" in kwargs and isinstance( - kwargs["kv_transfer_config"], dict): - from vllm.config.kv_transfer import KVTransferConfig - raw_config_dict = kwargs["kv_transfer_config"] - try: - kwargs["kv_transfer_config"] = KVTransferConfig( - **raw_config_dict) - except ValidationError as e: - logger.error( - "Failed to convert 'kv_transfer_config' dict to " - "KVTransferConfig object. Dict: %s. Error: %s", - raw_config_dict, e) - # Consider re-raising a more specific vLLM error or ValueError - # to provide better context to the user. - raise ValueError( - f"Invalid 'kv_transfer_config' provided: {e}") from e - - if hf_overrides is None: - hf_overrides = {} - - if compilation_config is not None: - if isinstance(compilation_config, int): - compilation_config_instance = CompilationConfig( - level=compilation_config) - elif isinstance(compilation_config, dict): - predicate = lambda x: is_init_field(CompilationConfig, x[0]) - compilation_config_instance = CompilationConfig( - **dict(filter(predicate, compilation_config.items()))) - else: - compilation_config_instance = compilation_config - else: - compilation_config_instance = CompilationConfig() - - if structured_outputs_config is not None: - if isinstance(structured_outputs_config, dict): - structured_outputs_instance = StructuredOutputsConfig( - **{ - k: v - for k, v in structured_outputs_config.items() - if is_init_field(StructuredOutputsConfig, k) - }) - else: - structured_outputs_instance = structured_outputs_config - else: - structured_outputs_instance = StructuredOutputsConfig() - - kwargs["enable_sleep_mode"] = True - engine_args = EngineArgs( - model=model, - runner=runner, - convert=convert, - tokenizer=tokenizer, - tokenizer_mode=tokenizer_mode, - skip_tokenizer_init=skip_tokenizer_init, - trust_remote_code=trust_remote_code, - allowed_local_media_path=allowed_local_media_path, - allowed_media_domains=allowed_media_domains, - tensor_parallel_size=tensor_parallel_size, - dtype=dtype, - quantization=quantization, - revision=revision, - tokenizer_revision=tokenizer_revision, - seed=seed, - gpu_memory_utilization=gpu_memory_utilization, - kv_cache_memory_bytes=kv_cache_memory_bytes, - swap_space=swap_space, - cpu_offload_gb=cpu_offload_gb, - enforce_eager=enforce_eager, - disable_custom_all_reduce=disable_custom_all_reduce, - hf_token=hf_token, - hf_overrides=hf_overrides, - mm_processor_kwargs=mm_processor_kwargs, - pooler_config=pooler_config, - override_pooler_config=override_pooler_config, - structured_outputs_config=structured_outputs_instance, - compilation_config=compilation_config_instance, - logits_processors=logits_processors, - **kwargs, - ) - engine_args.resource_placement_groups = resource_placement_groups - - log_non_default_args(engine_args) - - # Create the Engine (autoselects V0 vs V1) - self.llm_engine = LLMEngine0110.from_engine_args( - engine_args=engine_args, usage_context=UsageContext.LLM_CLASS) - self.engine_class = type(self.llm_engine) - - self.request_counter = Counter() - self.default_sampling_params: Union[dict[str, Any], None] = None 
- - supported_tasks = self.llm_engine.get_supported_tasks() # type: ignore - - logger.info("Supported_tasks: %s", supported_tasks) - - self.supported_tasks = supported_tasks - - # Load the Input/Output processor plugin if any - self.model_config = self.llm_engine.model_config - io_processor_plugin = self.llm_engine.model_config.io_processor_plugin - self.io_processor = get_io_processor(self.llm_engine.vllm_config, - io_processor_plugin) - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - def fetch_output(self): - # simulating non-blocking semantics when using v1 engine - if envs.VLLM_USE_V1: - try: - request_outputs = self.llm_engine.step_nowait() - except queue.Empty: - request_outputs = [] - else: - request_outputs = self.llm_engine.step() - - return request_outputs - - def get_num_waiting(self): - stats = self.llm_engine._get_stats(scheduler_outputs=None) - return stats.num_waiting_sys - - def add_requests( - self, - prompt_token_ids: List[List[int]], - request_ids: List[int] | None, - sampling_params: SamplingParams, - multi_modal_data: List[int] | None, - lora_requests: List[LoRARequest] | None, - ): - assert len(prompt_token_ids) == len(request_ids) - if multi_modal_data: - assert len(multi_modal_data) == len(request_ids) - for i, (token_ids, request_id) in enumerate(zip(prompt_token_ids, request_ids)): - if request_id is None: - request_id = next(self.request_counter) - lora_request = lora_requests[i] if lora_requests is not None else None - if multi_modal_data: - # in v1, input_preprocessor is in engine.processor - processor = getattr(self.llm_engine, "processor", None) - input_preprocessor = processor.input_preprocessor if processor else self.llm_engine.input_preprocessor - preprocessed_inputs = input_preprocessor.preprocess( - prompt={"prompt_token_ids": token_ids, "multi_modal_data": multi_modal_data[i]}, - lora_request=lora_request, - ) - # in v1, engine does not use an input_processor - processed_inputs = ( - self.llm_engine.input_processor(preprocessed_inputs) - if hasattr(self.llm_engine, "input_processor") - else preprocessed_inputs - ) - else: - processed_inputs = { - "type": "token", - "prompt_token_ids": token_ids - } - self.llm_engine._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=sampling_params, - arrival_time=time.time(), - lora_request=lora_request, - ) - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.llm_engine.abort_request(request_id) - - def clear_unfinished_requests(self): - self._run_engine(use_tqdm=True) - - # Parameter synchronization interfaces - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - if envs.VLLM_USE_V1: - weight_dict = { - "dtype": weight.dtype, - "weight": weight.cpu().tolist() - } - self.collective_rpc(method="update_parameter", args=(parameter_name, weight_dict, ranks_in_worker, is_lora)) - - def 
update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - # vllm 084 does not support serialization of torch.Tensor(GPU), must use custom - # numpy array encoder or use pickle. - # Can not convert to numpy array here, because of bug in encoder/decoder of vllm 084. - # Newer version of vllm support efficient serilization of torch.Tensor. - buffer = buffer.cpu().tolist() - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_11_0/llm_engine.py b/roll/third_party/vllm/vllm_0_11_0/llm_engine.py deleted file mode 100644 index bd573ccea..000000000 --- a/roll/third_party/vllm/vllm_0_11_0/llm_engine.py +++ /dev/null @@ -1,87 +0,0 @@ -from typing import Dict, Optional, Type - -from vllm import LLMEngine, EngineArgs, envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase - -import roll.third_party.vllm.fp8 as fp8 -from roll.utils.logging import get_logger - -logger = get_logger() - - -class LLMEngine0110(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = cls._get_executor_cls(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_10_0.ray_distributed_executor import ( - CustomRayDistributedExecutor as V0CustomRayDistributedExecutor) - executor_class = V0CustomRayDistributedExecutor - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls( - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - """Creates an LLM engine from the engine arguments.""" - # Create the engine configs. 
- vllm_config = engine_args.create_engine_config(usage_context) - parallel_config = vllm_config.parallel_config - - resource_placement_groups = getattr(engine_args, "resource_placement_groups") - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - # change worker cls to custom - cls.update_worker_cls_config(vllm_config) - - fp8.update_quant_config(vllm_config) - - engine_cls = cls - if envs.VLLM_USE_V1: - from roll.third_party.vllm.vllm_0_11_0.v1.llm_engine import ( - LLMEngine0110 as V1LLMEngine0110) - engine_cls = V1LLMEngine0110 - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - @classmethod - def update_worker_cls_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - - assert parallel_config.worker_cls != "auto" - if vllm_config.speculative_config: - pass - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_11_0.v1.worker.Worker0110" - else: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_11_0.worker.Worker0110" diff --git a/roll/third_party/vllm/vllm_0_11_0/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_11_0/ray_distributed_executor.py index 1b9288825..608f7f460 100644 --- a/roll/third_party/vllm/vllm_0_11_0/ray_distributed_executor.py +++ b/roll/third_party/vllm/vllm_0_11_0/ray_distributed_executor.py @@ -109,29 +109,18 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", env_vars = {} env_vars.update(roll_current_platform.get_custom_env_vars()) env_vars.update(roll_current_platform.get_vllm_run_time_env_vars(gpu_rank)) + env_vars["FLASHINFER_WORKSPACE_BASE"] = f"{os.environ['FLASHINFER_WORKSPACE_BASE']}_{rank}" runtime_env = RuntimeEnv(env_vars=env_vars) assert current_platform.ray_device_key == "GPU" # NV+AMD GPUs, and Intel XPUs - if current_platform.ray_device_key == "GPU": - worker = ray.remote( - num_cpus=0, - num_gpus=0.01, - runtime_env=runtime_env, - scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg, ), - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rpc_rank=rank) - else: - worker = ray.remote( - num_cpus=0, - num_gpus=0, - runtime_env=runtime_env, - resources={current_platform.ray_device_key: 0.01}, - scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg, ), - **ray_remote_kwargs, - )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, - rpc_rank=rank) - + worker = ray.remote( + num_cpus=0, + num_gpus=0.01, + runtime_env=runtime_env, + scheduling_strategy=PlacementGroupSchedulingStrategy(placement_group=pg, ), + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, + rpc_rank=rank) worker_metadata.append( RayWorkerMetaData(worker=worker, created_rank=rank)) diff --git a/roll/third_party/vllm/vllm_0_11_0/v1/llm_engine.py b/roll/third_party/vllm/vllm_0_11_0/v1/llm_engine.py deleted file mode 100644 index 8a0d95c28..000000000 --- a/roll/third_party/vllm/vllm_0_11_0/v1/llm_engine.py +++ /dev/null @@ -1,233 +0,0 @@ -import os -import time -from collections.abc import Mapping, Sequence -from copy import copy -from typing import Any, Optional, Union - -from vllm import envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.v1.metrics.loggers import StatLoggerFactory -from vllm.v1.engine.processor import 
Processor -from vllm.config import VllmConfig -from vllm.inputs import ProcessorInputs, PromptType, SingletonInputs -from vllm.multimodal.inputs import MultiModalFeatureSpec -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.outputs import PoolingRequestOutput, RequestOutput -from vllm.lora.request import LoRARequest -from vllm.multimodal.utils import argsort_mm_positions -from vllm.pooling_params import PoolingParams -from vllm.sampling_params import SamplingParams -from vllm.v1.engine import EngineCoreRequest -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.core_client import SyncMPClient -from vllm.v1.executor.abstract import Executor -from vllm.v1.engine.llm_engine import LLMEngine -from vllm.v1.engine.parallel_sampling import ParentRequest -from vllm.v1.metrics.stats import IterationStats -from roll.utils.logging import get_logger - -logger = get_logger() - -def custom_process_inputs( - self, - request_id: str, - prompt: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - data_parallel_rank: Optional[int] = None, -) -> tuple[Optional[str], EngineCoreRequest]: - - # TODO(woosuk): Support pooling models. - self._validate_lora(lora_request) - self._validate_params(params) - - data_parallel_size = self.vllm_config.parallel_config.data_parallel_size - if data_parallel_rank is not None and not (0 <= data_parallel_rank < - data_parallel_size): - raise ValueError(f"data_parallel_rank {data_parallel_rank} " - f"is out of range [0, {data_parallel_size}).") - - assert arrival_time is not None - processed_inputs: ProcessorInputs = prompt - - eos_token_id = self.input_preprocessor.get_eos_token_id() - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - self._validate_model_inputs(encoder_inputs, decoder_inputs) - - sampling_params = None - pooling_params = None - if isinstance(params, SamplingParams): - # TODO: can we avoid cloning here in multiproc case? - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. - if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - if self.tokenizer is not None: - sampling_params.update_from_tokenizer(self.tokenizer) - else: - pooling_params = params.clone() - - # Multimodal related. - mm_features: Optional[list[MultiModalFeatureSpec]] = None - - if decoder_inputs["type"] == "multimodal": - decoder_mm_inputs = decoder_inputs["mm_kwargs"] - decoder_mm_positions = decoder_inputs["mm_placeholders"] - decoder_mm_hashes = decoder_inputs["mm_hashes"] - - # Merge and flatten multimodal placeholders, hashes and inputs - # from dictionaries to lists, and sort them by each item's position - # in the input sequence. 
- sorted_mm_idxs = argsort_mm_positions(decoder_mm_positions) - - mm_features = [] - for modality, idx in sorted_mm_idxs: - mm_features.append( - MultiModalFeatureSpec( - data=decoder_mm_inputs[modality][idx], - modality=modality, - identifier=decoder_mm_hashes[modality][idx], - mm_position=decoder_mm_positions[modality][idx])) - - return decoder_inputs.get("prompt"), EngineCoreRequest( - request_id=request_id, - prompt_token_ids=decoder_inputs["prompt_token_ids"], - mm_features=mm_features, - sampling_params=sampling_params, - pooling_params=pooling_params, - eos_token_id=eos_token_id, - arrival_time=arrival_time, - lora_request=lora_request, - cache_salt=decoder_inputs.get("cache_salt"), - priority=priority, - data_parallel_rank=data_parallel_rank, - trace_headers=trace_headers, - ) - -Processor.custom_process_inputs = custom_process_inputs - -def get_output_nowait(self) -> EngineCoreOutputs: - """ - Only get an item if one is immediately available. Otherwise - raise the queue.Empty exception. - """ - # If an exception arises in process_outputs_socket task, - # it is forwarded to the outputs_queue so we can raise it - # from this (run_output_handler) task to shut down the server. - outputs = self.outputs_queue.get_nowait() - if isinstance(outputs, Exception): - raise self._format_exception(outputs) from None - if outputs.wave_complete is not None: - self.engines_running = False - return outputs - -# Function 'step' of vllm v1 and v0 engine has different semantic. -# Function vllm.v1.engine.LLMEngine.step is blocking but that of v0 is not. -# This will cause deadlock when calling roll.third_party.vllm.vllm_0_8_4.Llm084.fetch_output -# inside VllmStrategy if set generate_opt_level to 1. -SyncMPClient.get_output_nowait = get_output_nowait - -class LLMEngine0110(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[list[StatLoggerFactory]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = Executor.get_class(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_11_0.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # Default fork method is not compatible with ScaleAligner. - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - prompt_str, request = self.processor.custom_process_inputs(request_id, processed_inputs, params, - arrival_time, lora_request, - trace_headers, - priority) - - n = params.n if isinstance(params, SamplingParams) else 1 - - if n == 1: - # Make a new RequestState and queue. - self.output_processor.add_request(request, prompt_str, None, 0) - # Add the request to EngineCore. 
- self.engine_core.add_request(request) - return - - # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) - for idx in range(n): - request_id, params = parent_req.get_child_info(idx) - child_request = request if idx == n - 1 else copy(request) - child_request.request_id = request_id - child_request.sampling_params = params - - # Make a new RequestState and queue. - self.output_processor.add_request(child_request,prompt_str, parent_req, idx) - # Add the request to EngineCore. - self.engine_core.add_request(child_request) - - def step_nowait(self) -> Union[list[RequestOutput], list[PoolingRequestOutput]]: - - if self.should_execute_dummy_batch: - self.should_execute_dummy_batch = False - self.engine_core.execute_dummy_batch() - return [] - - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output_nowait() - - # 2) Process EngineCoreOutputs. - iteration_stats = IterationStats() if self.log_stats else None - processed_outputs = self.output_processor.process_outputs( - outputs.outputs, - engine_core_timestamp=outputs.timestamp, - iteration_stats=iteration_stats) - - # 3) Abort any reqs that finished due to stop strings. - self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - - # 4) Record stats - if self.logger_manager is not None: - assert outputs.scheduler_stats is not None - self.logger_manager.record(scheduler_stats=outputs.scheduler_stats, - iteration_stats=iteration_stats) - self.do_log_stats_with_interval() - - return processed_outputs.request_outputs diff --git a/roll/third_party/vllm/vllm_0_11_0/v1/worker.py b/roll/third_party/vllm/vllm_0_11_0/v1/worker.py deleted file mode 100644 index 316ea41eb..000000000 --- a/roll/third_party/vllm/vllm_0_11_0/v1/worker.py +++ /dev/null @@ -1,51 +0,0 @@ -import gc -import time -from collections import OrderedDict - -import torch - -from roll.platforms import current_platform -from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import RecvBucketManager - - -logger = get_logger() - -Worker = current_platform.get_vllm_worker_class() - - -class Worker0110(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.lora_params = OrderedDict() - patch_vllm_lora_manager() - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - weight_dict = weight - weight = torch.tensor(weight_dict["weight"], dtype=weight_dict["dtype"]).cuda() - super().update_parameter(parameter_name, weight, ranks_in_worker, is_lora) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - RecvBucketManager.dict_to_meta(meta_infos) - super().broadcast_bucket(src_pp_rank, meta_infos, bucket_size) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - RecvBucketManager.dict_to_meta(meta_infos) - buffer = torch.tensor(buffer, dtype=torch.int8, device='cuda') - super().update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) - - def add_lora(self, peft_config) -> bool: - lora_int_id = int(time.time_ns() % 0x7FFFFFFF) - lora_request = TensorLoRARequest( - lora_name=f"{lora_int_id}", - lora_int_id=lora_int_id, - lora_path="dummy_lora_path", - peft_config=peft_config, - lora_tensors=self.lora_params, - ) - del self.lora_params - self.lora_params = OrderedDict() - super().reload_model() - return 
self.model_runner.add_lora(lora_request) diff --git a/roll/third_party/vllm/vllm_0_11_0/worker.py b/roll/third_party/vllm/vllm_0_11_0/worker.py deleted file mode 100644 index d88e99b9c..000000000 --- a/roll/third_party/vllm/vllm_0_11_0/worker.py +++ /dev/null @@ -1,15 +0,0 @@ -import gc -from typing import Optional - -import torch -from vllm.worker.worker import Worker - -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger - -logger = get_logger() - - -class Worker0110(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) diff --git a/roll/third_party/vllm/vllm_0_10_0/v1/__init__.py b/roll/third_party/vllm/vllm_0_12_0/__init__.py similarity index 100% rename from roll/third_party/vllm/vllm_0_10_0/v1/__init__.py rename to roll/third_party/vllm/vllm_0_12_0/__init__.py diff --git a/roll/third_party/vllm/vllm_0_12_0/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_12_0/ray_distributed_executor.py new file mode 100644 index 000000000..979457445 --- /dev/null +++ b/roll/third_party/vllm/vllm_0_12_0/ray_distributed_executor.py @@ -0,0 +1,190 @@ +import os +from typing import TYPE_CHECKING + +import ray +from ray.runtime_env import RuntimeEnv +from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy +from vllm.platforms import current_platform +from vllm.ray.ray_env import get_env_vars_to_copy +from vllm.utils.network_utils import get_distributed_init_method, get_ip, get_open_port +from vllm.v1.core.sched.output import SchedulerOutput +from vllm.v1.executor.ray_executor import RayDistributedExecutor, RayWorkerMetaData +from vllm.v1.executor.ray_utils import RayWorkerWrapper + +from roll.platforms import current_platform as roll_current_platform +from roll.utils.logging import get_logger + + +if TYPE_CHECKING: + from ray.util.placement_group import PlacementGroup + + +logger = get_logger() + + +def initialize_ray_cluster(ray_address: str | None = None): + if ray.is_initialized(): + return + ray.init(address=ray_address) + + +class CustomRayDistributedExecutor(RayDistributedExecutor): + def _init_executor(self) -> None: + self.forward_dag: ray.dag.CompiledDAG | None = None + + assert not current_platform.is_tpu() + + placement_group = self.parallel_config.placement_group + assert self.uses_ray + assert len(placement_group) > 0 + initialize_ray_cluster(placement_group[0]["ray_address"]) + assert ray.is_initialized() + + # Disable Ray usage stats collection. + ray_usage = os.environ.get("RAY_USAGE_STATS_ENABLED", "0") + if ray_usage != "1": + os.environ["RAY_USAGE_STATS_ENABLED"] = "0" + + # Create the parallel GPU workers. + self._init_workers_ray(placement_group) + + # KV connector setup + self.has_connector = self.vllm_config.kv_transfer_config is not None + + self.uses_sampler = self.vllm_config.model_config.runner_type != "pooling" and ( + self.vllm_config.ec_transfer_config is None or not self.vllm_config.ec_transfer_config.is_ec_producer + ) + + self.scheduler_output: SchedulerOutput | None = None + + def _init_workers_ray(self, placement_group: "PlacementGroup", **ray_remote_kwargs): + assert len(placement_group) == self.parallel_config.world_size + + # The driver dummy worker does not actually use any resources. + # It holds the resource for the driver worker. + self.driver_dummy_worker: RayWorkerWrapper | None = None + # The remaining workers are the actual ray actors. 
+ self.workers: list[RayWorkerWrapper] = [] + + # Used in ray compiled DAG: indexed first by PP rank, + # and then TP rank. In other words, the inner list is + # the TP group of workers for a PP rank. + self.pp_tp_workers: list[list[RayWorkerWrapper]] = [] + + if self.parallel_config.ray_workers_use_nsight: + ray_remote_kwargs = self._configure_ray_workers_use_nsight(ray_remote_kwargs) + + worker_metadata: list[RayWorkerMetaData] = [] + driver_ip = get_ip() + for rank in range(self.parallel_config.world_size): + pg = placement_group[rank]["placement_group"] + gpu_rank = placement_group[rank]["gpu_rank"] + env_vars = {} + env_vars.update(roll_current_platform.get_custom_env_vars()) + env_vars.update(roll_current_platform.get_vllm_run_time_env_vars(gpu_rank)) + runtime_env = RuntimeEnv(env_vars=env_vars) + assert current_platform.ray_device_key == "GPU" + # NV+AMD GPUs, and Intel XPUs + worker = ray.remote( + num_cpus=0, + num_gpus=0.01, + runtime_env=runtime_env, + scheduling_strategy=PlacementGroupSchedulingStrategy( + placement_group=pg, + ), + **ray_remote_kwargs, + )(RayWorkerWrapper).remote(vllm_config=self.vllm_config, rpc_rank=rank) + worker_metadata.append(RayWorkerMetaData(worker=worker, created_rank=rank)) + + worker_ips = ray.get( + [ + each.worker.get_node_ip.remote() # type: ignore[attr-defined] + for each in worker_metadata + ] + ) + + for each, ip in zip(worker_metadata, worker_ips): + each.ip = ip + + logger.debug("workers: %s", worker_metadata) + logger.debug("driver_dummy_worker: %s", self.driver_dummy_worker) + + # No need to sort, just use the given resource order of the placement group + for i, item in enumerate(worker_metadata): + item.adjusted_rank = i + self.workers = [item.worker for item in worker_metadata] + rerank_mapping = {item.created_rank: item.adjusted_rank for item in worker_metadata} + self.collective_rpc("adjust_rank", args=(rerank_mapping,)) + + # Get the set of GPU IDs used on each node. + worker_node_and_gpu_ids = [] + for worker in [self.driver_dummy_worker] + self.workers: + if worker is None: + # driver_dummy_worker can be None when using ray spmd worker. + continue + worker_node_and_gpu_ids.append(ray.get(worker.get_node_and_gpu_ids.remote())) # type: ignore[attr-defined] + + # Set environment variables for the driver and workers. + # remove device_control_env_var(CUDA_VISIBLE_DEVICES), for we only allocate one gpu for each worker + all_args_to_update_environment_variables = [{}] * len(worker_node_and_gpu_ids) + + # Environment variables to copy from driver to workers + env_vars_to_copy = get_env_vars_to_copy( + exclude_vars=self.WORKER_SPECIFIC_ENV_VARS, + additional_vars=set(current_platform.additional_env_vars).union(self.ADDITIONAL_ENV_VARS), + destination="workers", + ) + + # Copy existing env vars to each worker's args + for args in all_args_to_update_environment_variables: + # TODO: refactor platform-specific env vars + for name in env_vars_to_copy: + if name in os.environ: + args[name] = os.environ[name] + + self._env_vars_for_all_workers = all_args_to_update_environment_variables + + self.collective_rpc("update_environment_variables", args=(self._get_env_vars_to_be_updated(),)) + + distributed_init_method = get_distributed_init_method(driver_ip, get_open_port()) + + # Initialize the actual workers inside worker wrapper. 
+ all_kwargs = [] + for rank, (node_id, _) in enumerate(worker_node_and_gpu_ids): + local_rank = 0 + kwargs = dict( + vllm_config=self.vllm_config, + local_rank=local_rank, + rank=rank, + distributed_init_method=distributed_init_method, + is_driver_worker=(not self.parallel_config) or (rank % self.parallel_config.tensor_parallel_size == 0), + ) + all_kwargs.append(kwargs) + self.collective_rpc("init_worker", args=(all_kwargs,)) + + self.collective_rpc("init_device") + self.collective_rpc("load_model") + + for pp_rank in range(self.parallel_config.pipeline_parallel_size): + self.pp_tp_workers.append([]) + for tp_rank in range(self.parallel_config.tensor_parallel_size): + # PP=2, TP=4 + # pp_tp_workers = [[0, 1, 2, 3], [4, 5, 6, 7]] + rank = (pp_rank * self.parallel_config.tensor_parallel_size) + tp_rank + assert len(self.pp_tp_workers[pp_rank]) == tp_rank + assert pp_rank < len(self.pp_tp_workers) + self.pp_tp_workers[pp_rank].append(self.workers[rank]) + + def shutdown(self) -> None: + logger.info( + "Shutting down Ray distributed executor. If you see error log " + "from logging.cc regarding SIGTERM received, please ignore because " + "this is the expected termination process in Ray." + ) + if hasattr(self, "forward_dag") and self.forward_dag is not None: + self.forward_dag.teardown() + import ray + + for worker in self.workers: + ray.kill(worker) + self.forward_dag = None diff --git a/roll/third_party/vllm/vllm_0_8_4/__init__.py b/roll/third_party/vllm/vllm_0_8_4/__init__.py index 6f1750d59..633252a34 100644 --- a/roll/third_party/vllm/vllm_0_8_4/__init__.py +++ b/roll/third_party/vllm/vllm_0_8_4/__init__.py @@ -1,16 +1,89 @@ +# Patch CustomAsyncLLM.generate and OutputProcessor.abort_requests +# (more on tests.third_party.vllm.test_vllm_local.test_vllm_abort) +from typing import Optional +from collections.abc import AsyncGenerator, Mapping, Iterable +import asyncio + +from vllm.inputs import PromptType +from vllm.lora.request import LoRARequest +from vllm.outputs import RequestOutput +from vllm.prompt_adapter.request import PromptAdapterRequest +from vllm.sampling_params import SamplingParams +from vllm.v1.engine.output_processor import OutputProcessor + +from roll.third_party.vllm.async_llm import CustomAsyncLLM + +async def generate( + self, + prompt: PromptType, + sampling_params: SamplingParams, + request_id: str, + lora_request: Optional[LoRARequest] = None, + trace_headers: Optional[Mapping[str, str]] = None, + prompt_adapter_request: Optional[PromptAdapterRequest] = None, + priority: int = 0, +) -> AsyncGenerator[RequestOutput, None]: + try: + if self.output_handler is None: + self.output_handler = asyncio.create_task( + self._run_output_handler()) + + q = await self.add_request( + request_id, + prompt, + sampling_params, + lora_request=lora_request, + trace_headers=trace_headers, + prompt_adapter_request=prompt_adapter_request, + priority=priority, + ) + + finished = False + while not finished: + out = q.get_nowait() or await q.get() + + if isinstance(out, BaseException) or (isinstance(out, type) and issubclass(out, BaseException)): + # raise asyncio.CancelledError, will not cause dead recursive + raise out + + finished = out.finished + yield out + + except asyncio.CancelledError: + await self.abort(request_id) + raise +CustomAsyncLLM.generate = generate + +def abort_requests( + self, + request_ids: Iterable[str], +) -> list[str]: + request_ids_to_abort = [] + for request_id in request_ids: + req_state = self.request_states.pop(request_id, None) + if req_state is not None: + 
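+            # Known request: release its LoRA state and record it as aborted; the
+            # CancelledError queued below wakes the pending generate() coroutine.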
self.lora_states.abort_request(req_state) + request_ids_to_abort.append(request_id) + req_state.queue.put(asyncio.CancelledError) # wakeup generate coroutine with asyncio.CancelledError + else: + parent = self.parent_requests.pop(request_id, None) + if parent and parent.child_requests: + self.abort_requests(parent.child_requests) + request_ids_to_abort.extend(parent.child_requests) + return request_ids_to_abort +OutputProcessor.abort_requests = abort_requests + + # patch qwen3 fp8 # https://github.com/vllm-project/vllm/issues/17327 # https://github.com/vllm-project/vllm/pull/17318 - from vllm.model_executor.layers.linear import QKVParallelLinear - from typing import Optional import torch from vllm.model_executor.parameter import (BasevLLMParameter, BlockQuantScaleParameter, PerTensorScaleParameter, RowvLLMParameter) - def weight_loader_v2(self, param: BasevLLMParameter, loaded_weight: torch.Tensor, @@ -45,7 +118,7 @@ def weight_loader_v2(self, shard_id=loaded_shard_id, shard_offset=shard_offset, shard_size=shard_size) - QKVParallelLinear.weight_loader_v2 = weight_loader_v2 + __all__ = [] diff --git a/roll/third_party/vllm/vllm_0_8_4/llm.py b/roll/third_party/vllm/vllm_0_8_4/llm.py deleted file mode 100644 index 0f716bb53..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/llm.py +++ /dev/null @@ -1,230 +0,0 @@ -import os -import queue -import time -from typing import Any, Dict, Iterable, List, Optional, Union - -import cloudpickle -import torch -from vllm import LLM, EngineArgs, SamplingParams, envs -from vllm.config import CompilationConfig -from vllm.engine.arg_utils import HfOverrides, PoolerConfig, TaskOption -from vllm.lora.request import LoRARequest -from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter -from vllm.envs import get_default_cache_root - -from roll.platforms import current_platform -from roll.third_party.vllm.vllm_0_8_4.llm_engine import LLMEngine084 -from roll.utils.send_recv_utils import SendBucketManager - - -class Llm084(LLM): - - def __init__( - self, - resource_placement_groups: List[Dict], - model: str, - tokenizer: Optional[str] = None, - tokenizer_mode: str = "auto", - skip_tokenizer_init: bool = False, - trust_remote_code: bool = False, - allowed_local_media_path: str = "", - tensor_parallel_size: int = 1, - dtype: str = "auto", - quantization: Optional[str] = None, - revision: Optional[str] = None, - tokenizer_revision: Optional[str] = None, - seed: Optional[int] = None, - gpu_memory_utilization: float = 0.9, - swap_space: float = 4, - cpu_offload_gb: float = 0, - enforce_eager: Optional[bool] = None, - max_seq_len_to_capture: int = 8192, - disable_custom_all_reduce: bool = False, - disable_async_output_proc: bool = False, - hf_overrides: Optional[HfOverrides] = None, - mm_processor_kwargs: Optional[dict[str, Any]] = None, - # After positional args are removed, move this right below `model` - task: TaskOption = "auto", - override_pooler_config: Optional[PoolerConfig] = None, - compilation_config: Optional[Union[int, dict[str, Any]]] = None, - **kwargs, - ) -> None: - ''' - LLM constructor. - - Note: if enforce_eager is unset (enforce_eager is None) - it defaults to False. 
- ''' - - # setup envs for vllm - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - # torch.cuda may already init, explicitly disable expandable_segments - # here (only matters when VLLM_USE_RAY_SPMD_WORKER=0) - current_platform.set_allocator_settings("expandable_segments:False") - - os.environ["VLLM_CACHE_ROOT"] = os.path.join( - get_default_cache_root(), "vllm", os.environ.get("WORKER_NAME", "")) - - if "disable_log_stats" not in kwargs: - kwargs["disable_log_stats"] = True - - if "worker_cls" in kwargs: - worker_cls = kwargs["worker_cls"] - # if the worker_cls is not qualified string name, - # we serialize it using cloudpickle to avoid pickling issues - if isinstance(worker_cls, type): - kwargs["worker_cls"] = cloudpickle.dumps(worker_cls) - - if compilation_config is not None: - if isinstance(compilation_config, (int, dict)): - compilation_config_instance = CompilationConfig.from_cli( - str(compilation_config)) - else: - compilation_config_instance = compilation_config - else: - compilation_config_instance = None - - kwargs["enable_sleep_mode"] = True - engine_args = EngineArgs( - model=model, - task=task, - tokenizer=tokenizer, - tokenizer_mode=tokenizer_mode, - skip_tokenizer_init=skip_tokenizer_init, - trust_remote_code=trust_remote_code, - allowed_local_media_path=allowed_local_media_path, - tensor_parallel_size=tensor_parallel_size, - dtype=dtype, - quantization=quantization, - revision=revision, - tokenizer_revision=tokenizer_revision, - seed=seed, - gpu_memory_utilization=gpu_memory_utilization, - swap_space=swap_space, - cpu_offload_gb=cpu_offload_gb, - enforce_eager=enforce_eager, - max_seq_len_to_capture=max_seq_len_to_capture, - disable_custom_all_reduce=disable_custom_all_reduce, - disable_async_output_proc=disable_async_output_proc, - hf_overrides=hf_overrides, - mm_processor_kwargs=mm_processor_kwargs, - override_pooler_config=override_pooler_config, - compilation_config=compilation_config_instance, - **kwargs, - ) - engine_args.resource_placement_groups = resource_placement_groups - - # Create the Engine (autoselects V0 vs V1) - self.llm_engine = LLMEngine084.from_engine_args( - engine_args=engine_args, usage_context=UsageContext.LLM_CLASS) - self.engine_class = type(self.llm_engine) - - self.request_counter = Counter() - self.default_sampling_params: Union[dict[str, Any], None] = None - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - def fetch_output(self): - # simulating non blocking semantic when using v1 engine - if envs.VLLM_USE_V1: - try: - request_outputs = self.llm_engine.step_nowait() - except queue.Empty: - request_outputs = [] - else: - request_outputs = self.llm_engine.step() - return request_outputs - - def get_num_waiting(self): - stats = self.llm_engine._get_stats(scheduler_outputs=None) - return stats.num_waiting_sys - - def add_requests( - self, - prompt_token_ids: List[List[int]], - request_ids: List[int] | None, - sampling_params: SamplingParams, - multi_modal_data: List[int] | None, - lora_requests: List[LoRARequest] | None, - ): - assert len(prompt_token_ids) == len(request_ids) - if multi_modal_data: - assert len(multi_modal_data) == len(request_ids) - for i, (token_ids, request_id) in enumerate(zip(prompt_token_ids, request_ids)): - if request_id is None: - request_id = 
next(self.request_counter) - lora_request = lora_requests[i] if lora_requests is not None else None - if multi_modal_data: - # in v1, input_preprocessor is in engine.processor - processor = getattr(self.llm_engine, "processor", None) - input_preprocessor = processor.input_preprocessor if processor else self.llm_engine.input_preprocessor - preprocessed_inputs = input_preprocessor.preprocess( - prompt={"prompt_token_ids": token_ids, "multi_modal_data": multi_modal_data[i]}, - lora_request=lora_request, - prompt_adapter_request=None, - ) - # in v1, engine does not use a input_processor - processed_inputs = ( - self.llm_engine.input_processor(preprocessed_inputs) - if hasattr(self.llm_engine, "input_processor") - else preprocessed_inputs - ) - else: - processed_inputs = {"type": "token", "prompt_token_ids": token_ids} - self.llm_engine._add_processed_request( - request_id=request_id, - processed_inputs=processed_inputs, - params=sampling_params, - arrival_time=time.time(), - lora_request=lora_request, - prompt_adapter_request=None, - ) - - def abort_request(self, request_id: Union[str, Iterable[str]]) -> None: - self.llm_engine.abort_request(request_id) - - def clear_unfinished_requests(self): - self._run_engine(use_tqdm=True) - - # 参数同步接口 - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - if envs.VLLM_USE_V1: - weight_dict = { - "dtype": weight.dtype, - "weight": weight.cpu().tolist() - } - else: - weight_dict = weight - self.collective_rpc(method="update_parameter", args=(parameter_name, weight_dict, ranks_in_worker, is_lora)) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - # vllm 084 does not support serialization of torch.Tensor(GPU), must use custom - # numpy array encoder or use pickle. - # Can not convert to numpy array here, because of bug in encoder/decoder of vllm 084. - # Newer version of vllm support efficient serilization of torch.Tensor. 
- buffer = buffer.cpu().tolist() - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_8_4/llm_engine.py b/roll/third_party/vllm/vllm_0_8_4/llm_engine.py deleted file mode 100644 index 3a61a6169..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/llm_engine.py +++ /dev/null @@ -1,89 +0,0 @@ -from typing import Dict, Optional, Type - -from vllm import LLMEngine, EngineArgs, envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase - -import roll.third_party.vllm.fp8 as fp8 -from roll.utils.logging import get_logger - -logger = get_logger() - - -class LLMEngine084(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - parallel_config = vllm_config.parallel_config - - executor_class = cls._get_executor_cls(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_8_4.ray_distributed_executor import ( - CustomRayDistributedExecutor as V0CustomRayDistributedExecutor) - executor_class = V0CustomRayDistributedExecutor - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls( - vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - ) - - @classmethod - def from_engine_args( - cls, - engine_args: EngineArgs, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[Dict[str, StatLoggerBase]] = None, - ) -> "LLMEngine": - # Create the engine configs. 
- vllm_config = engine_args.create_engine_config(usage_context) - parallel_config = vllm_config.parallel_config - - resource_placement_groups = getattr(engine_args, "resource_placement_groups") - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - # change worker cls to custom - cls.update_worker_cls_config(vllm_config) - - fp8.update_quant_config(vllm_config) - - engine_cls = cls - if envs.VLLM_USE_V1: - from roll.third_party.vllm.vllm_0_8_4.v1.llm_engine import ( - LLMEngine084 as V1LLMEngine084) - engine_cls = V1LLMEngine084 - - return engine_cls.from_vllm_config( - vllm_config=vllm_config, - usage_context=usage_context, - stat_loggers=stat_loggers, - disable_log_stats=engine_args.disable_log_stats, - ) - - @classmethod - def update_worker_cls_config(cls, vllm_config: VllmConfig) -> None: - parallel_config = vllm_config.parallel_config - scheduler_config = vllm_config.scheduler_config - - assert parallel_config.worker_cls != "auto" - if scheduler_config.is_multi_step: - pass - elif vllm_config.speculative_config: - pass - else: - if envs.VLLM_USE_V1: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_8_4.v1.worker.Worker084" - else: - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_8_4.worker.Worker084" diff --git a/roll/third_party/vllm/vllm_0_8_4/ray_distributed_executor.py b/roll/third_party/vllm/vllm_0_8_4/ray_distributed_executor.py index 7f0e6a45d..85c2596ed 100644 --- a/roll/third_party/vllm/vllm_0_8_4/ray_distributed_executor.py +++ b/roll/third_party/vllm/vllm_0_8_4/ray_distributed_executor.py @@ -109,6 +109,7 @@ def _init_workers_ray(self, placement_group: "PlacementGroup", env_vars = {} env_vars.update(roll_current_platform.get_custom_env_vars()) env_vars.update(roll_current_platform.get_vllm_run_time_env_vars(gpu_rank)) + env_vars["FLASHINFER_WORKSPACE_BASE"] = f"{os.environ['FLASHINFER_WORKSPACE_BASE']}_{rank}" runtime_env = RuntimeEnv(env_vars=env_vars) assert current_platform.ray_device_key == "GPU" # NV+AMD GPUs, and Intel XPUs diff --git a/roll/third_party/vllm/vllm_0_8_4/v1/async_llm.py b/roll/third_party/vllm/vllm_0_8_4/v1/async_llm.py deleted file mode 100644 index 430b11887..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/v1/async_llm.py +++ /dev/null @@ -1,98 +0,0 @@ -import os -import asyncio -from typing import (Tuple, List, Dict, Optional, Union, Any, - Callable, Dict, List, Optional) - -from vllm import envs -from vllm.v1.engine.async_llm import AsyncLLM -from vllm.engine.arg_utils import AsyncEngineArgs -from vllm.entrypoints.llm import _R -from vllm.usage.usage_lib import UsageContext -from vllm.v1.executor.abstract import Executor - -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import SendBucketManager - -logger = get_logger() - -class AsyncLLM084(AsyncLLM): - - def __init__(self, resource_placement_groups, **kwargs): - assert envs.VLLM_USE_V1 - - engine_args = AsyncEngineArgs( - **kwargs, - ) - engine_args.enable_sleep_mode = True - vllm_config = engine_args.create_engine_config(UsageContext.ENGINE_CONTEXT) - - parallel_config = vllm_config.parallel_config - assert len(resource_placement_groups) == parallel_config.world_size - parallel_config.placement_group = resource_placement_groups - - assert not vllm_config.scheduler_config.is_multi_step - assert not vllm_config.speculative_config - parallel_config.worker_cls = "roll.third_party.vllm.vllm_0_8_4.v1.worker.Worker084" - - executor_class = Executor.get_class(vllm_config) - 
if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_8_4.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # https://github.com/vllm-project/vllm/pull/14189/files - # TODO do not override other options in PYTORCH_CUDA_ALLOC_CONF - os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "" - - # Default fork method is not compatible with ScaleAligner. - os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using AsyncLLM") - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return super().__init__( - vllm_config=vllm_config, - executor_class=executor_class, - start_engine_loop=True, - log_requests=True, - log_stats=True, - usage_context=UsageContext.ENGINE_CONTEXT, - ) - - def collective_rpc(self, - method: Union[str, Callable[..., _R]], - timeout: Optional[float] = None, - args: Tuple = (), - kwargs: Optional[Dict[str, Any]] = None) -> List[_R]: - loop = asyncio.get_event_loop() - return loop.run_until_complete(self.engine_core.collective_rpc_async(method, timeout, args, kwargs)) - - def load_states(self): - self.collective_rpc(method="load_states") - - def offload_states(self, level=1): - self.reset_prefix_cache() - self.collective_rpc(method="offload_states", args=(level,)) - - # 参数同步接口 - def setup_collective_group(self, *args, **kwargs): - self.collective_rpc(method="setup_collective_group", args=args, kwargs=kwargs) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="broadcast_bucket", args=(src_pp_rank, meta_infos, bucket_size)) - - def broadcast_parameter(self, *args, **kwargs): - self.collective_rpc(method="broadcast_parameter", args=args, kwargs=kwargs) - - def update_parameter(self, *args, **kwargs): - self.collective_rpc(method="update_parameter", args=args, kwargs=kwargs) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if envs.VLLM_USE_V1: - SendBucketManager.meta_to_dict(meta_infos) - self.collective_rpc(method="update_parameter_in_bucket", args=(meta_infos, buffer, ranks_in_worker)) - - def add_lora(self, *args, **kwargs): - self.collective_rpc(method="add_lora", args=args, kwargs=kwargs) diff --git a/roll/third_party/vllm/vllm_0_8_4/v1/llm_engine.py b/roll/third_party/vllm/vllm_0_8_4/v1/llm_engine.py deleted file mode 100644 index 8373f6968..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/v1/llm_engine.py +++ /dev/null @@ -1,244 +0,0 @@ -import os -from collections.abc import Mapping, Sequence -from copy import copy -from typing import Optional, Union - -from vllm import envs -from vllm.config import VllmConfig -from vllm.usage.usage_lib import UsageContext -from vllm.engine.metrics_types import StatLoggerBase -from vllm.v1.engine.processor import Processor -from vllm.config import VllmConfig -from vllm.inputs import ProcessorInputs -from vllm.inputs.parse import split_enc_dec_inputs -from vllm.outputs import RequestOutput -from vllm.lora.request import LoRARequest -from vllm.multimodal import MultiModalKwargs -from vllm.multimodal.inputs import PlaceholderRange -from vllm.multimodal.utils import merge_and_sort_multimodal_metadata -from vllm.pooling_params import PoolingParams -from vllm.prompt_adapter.request import PromptAdapterRequest -from vllm.sampling_params import SamplingParams -from vllm.v1.engine 
import EngineCoreRequest -from vllm.v1.engine import EngineCoreOutputs -from vllm.v1.engine.core_client import SyncMPClient -from vllm.v1.executor.abstract import Executor -from vllm.v1.engine.llm_engine import LLMEngine -from vllm.v1.engine.parallel_sampling import ParentRequest -from roll.utils.logging import get_logger - -logger = get_logger() - -def custom_process_inputs( - self, - request_id: str, - prompt: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: Optional[float] = None, - lora_request: Optional[LoRARequest] = None, - trace_headers: Optional[Mapping[str, str]] = None, - prompt_adapter_request: Optional[PromptAdapterRequest] = None, - priority: int = 0, -) -> EngineCoreRequest: - - self._validate_lora(lora_request) - self._validate_params(params) - if priority != 0: - raise ValueError("V1 does not support priority yet.") - if trace_headers is not None: - raise ValueError("V1 does not support tracing yet.") - if prompt_adapter_request is not None: - raise ValueError("V1 does not support prompt_adapter_request.") - - assert arrival_time is not None - - processed_inputs: ProcessorInputs = prompt - eos_token_id = self.input_preprocessor.get_eos_token_id(lora_request) - - self._validate_model_inputs(processed_inputs, lora_request) - - encoder_inputs, decoder_inputs = split_enc_dec_inputs(processed_inputs) - - if encoder_inputs is not None: - raise NotImplementedError - - assert isinstance(params, SamplingParams) - sampling_params = params.clone() - # If unset max tokens, then generate up to the max_model_len. - if sampling_params.max_tokens is None: - sampling_params.max_tokens = ( - self.model_config.max_model_len - - len(decoder_inputs["prompt_token_ids"])) - sampling_params.update_from_generation_config( - self.generation_config_fields, eos_token_id) - sampling_params.update_from_tokenizer( - self.tokenizer.get_lora_tokenizer(lora_request)) - - # Multimodal related. - sorted_mm_inputs: Optional[Sequence[Optional[MultiModalKwargs]]] = None - sorted_mm_positions: Optional[list[PlaceholderRange]] = None - sorted_mm_hashes: Optional[list[str]] = None - if decoder_inputs["type"] == "multimodal": - decoder_mm_inputs = decoder_inputs["mm_kwargs"] - - # Merge and flatten multimodal placeholders, hashes and inputs - # from dictionaries to lists, and sort them by each item's position - # in the input sequence. - ( - sorted_item_modalities, - sorted_mm_positions, - sorted_mm_hashes, - ) = merge_and_sort_multimodal_metadata( - decoder_inputs["mm_placeholders"], - decoder_inputs["mm_hashes"] if self.use_hash else None, - ) - - # The output of merged multi-modal processor (`decoder_mm_inputs`) - # is a single MultiModalKwargs for all items from all modalities. - # This code flattens kwargs for individual items in a list and - # sorts them by each item's position in the input sequence if there - # are multiple modalities. 
- unique_modalities = set(sorted_item_modalities) - if len(unique_modalities) > 1: - orig_sorted_mm_inputs = [] - used_indices = {modality: 0 for modality in unique_modalities} - - for modality in sorted_item_modalities: - items = decoder_mm_inputs.get_items(modality) - item = items[used_indices[modality]] - - orig_sorted_mm_inputs.append( - MultiModalKwargs.from_items([item])) - used_indices[modality] += 1 - else: - orig_sorted_mm_inputs = [ - MultiModalKwargs.from_items([item]) for item in - decoder_mm_inputs.get_items(sorted_item_modalities[0]) - ] - - if sorted_mm_hashes is not None: - sorted_mm_inputs = self.mm_input_cache_client.get_and_update_p0( - orig_sorted_mm_inputs, sorted_mm_hashes) - else: - sorted_mm_inputs = orig_sorted_mm_inputs - - return EngineCoreRequest( - request_id=request_id, - prompt=decoder_inputs.get("prompt"), - prompt_token_ids=decoder_inputs["prompt_token_ids"], - mm_inputs=sorted_mm_inputs, - mm_hashes=sorted_mm_hashes, - mm_placeholders=sorted_mm_positions, - sampling_params=sampling_params, - eos_token_id=eos_token_id, - arrival_time=arrival_time, - lora_request=lora_request, - ) - -Processor.custom_process_inputs = custom_process_inputs - -def get_output_nowait(self) -> EngineCoreOutputs: - """ - Only get an item if one is immediately available. Otherwise - raise the queue.Empty exception. - """ - return self.outputs_queue.get_nowait() - -# Function 'step' of vllm v1 and v0 engine has different semantic. -# Function vllm.v1.engine.LLMEngine.step is blocking but that of v0 is not. -# This will cause deadlock when calling roll.third_party.vllm.vllm_0_8_4.Llm084.fetch_output -# inside VllmStrategy if set generate_opt_level to 1. -SyncMPClient.get_output_nowait = get_output_nowait - -class LLMEngine084(LLMEngine): - - @classmethod - def from_vllm_config( - cls, - vllm_config: VllmConfig, - usage_context: UsageContext = UsageContext.ENGINE_CONTEXT, - stat_loggers: Optional[dict[str, StatLoggerBase]] = None, - disable_log_stats: bool = False, - ) -> "LLMEngine": - if stat_loggers is not None: - raise NotImplementedError( - "Passing StatLoggers to V1 is not yet supported. " - "Set VLLM_USE_V1=0 and file and issue on Github.") - - parallel_config = vllm_config.parallel_config - - executor_class = Executor.get_class(vllm_config) - if parallel_config.distributed_executor_backend == "ray": - from roll.third_party.vllm.vllm_0_8_4.v1.ray_distributed_executor import ( - CustomRayDistributedExecutor as V1CustomeRayDistributedExecutor) - executor_class = V1CustomeRayDistributedExecutor - - # Default fork method is not compatible with ScaleAligner. 
- os.environ['VLLM_WORKER_MULTIPROC_METHOD'] = 'spawn' - - logger.info(f"Using executor_class: {executor_class}") - logger.info(f"Using worker cls: {parallel_config.worker_cls}") - return cls(vllm_config=vllm_config, - executor_class=executor_class, - log_stats=(not disable_log_stats), - usage_context=usage_context, - stat_loggers=stat_loggers, - multiprocess_mode=envs.VLLM_ENABLE_V1_MULTIPROCESSING) - - def _add_processed_request( - self, - request_id: str, - processed_inputs: ProcessorInputs, - params: Union[SamplingParams, PoolingParams], - arrival_time: float, - lora_request: Optional[LoRARequest], - prompt_adapter_request: Optional[PromptAdapterRequest], - trace_headers: Optional[Mapping[str, str]] = None, - priority: int = 0, - ) -> None: - request = self.processor.custom_process_inputs(request_id, processed_inputs, params, - arrival_time, lora_request, - trace_headers, - prompt_adapter_request, - priority) - - n = params.n if isinstance(params, SamplingParams) else 1 - - if n == 1: - # Make a new RequestState and queue. - self.output_processor.add_request(request, None, 0) - # Add the request to EngineCore. - self.engine_core.add_request(request) - return - - # Fan out child requests (for n>1). - parent_req = ParentRequest(request_id, params) - for idx in range(n): - request_id, params = parent_req.get_child_info(idx) - child_request = request if idx == n - 1 else copy(request) - child_request.request_id = request_id - child_request.sampling_params = params - - # Make a new RequestState and queue. - self.output_processor.add_request(child_request, parent_req, idx) - # Add the request to EngineCore. - self.engine_core.add_request(child_request) - - def step_nowait(self) -> list[RequestOutput]: - - if self.should_execute_dummy_batch: - self.should_execute_dummy_batch = False - self.engine_core.execute_dummy_batch() - return [] - - # 1) Get EngineCoreOutput from the EngineCore. - outputs = self.engine_core.get_output_nowait() - - # 2) Process EngineCoreOutputs. - processed_outputs = self.output_processor.process_outputs( - outputs.outputs) - - # 3) Abort any reqs that finished due to stop strings. 
- self.engine_core.abort_requests(processed_outputs.reqs_to_abort) - - return processed_outputs.request_outputs diff --git a/roll/third_party/vllm/vllm_0_8_4/v1/worker.py b/roll/third_party/vllm/vllm_0_8_4/v1/worker.py deleted file mode 100644 index a0e473d19..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/v1/worker.py +++ /dev/null @@ -1,51 +0,0 @@ -import gc -import time -from collections import OrderedDict - -import torch -from vllm.device_allocator.cumem import CuMemAllocator - -from roll.platforms import current_platform -from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import RecvBucketManager - - -logger = get_logger() - -Worker = current_platform.get_vllm_worker_class() - -class Worker084(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.lora_params = OrderedDict() - patch_vllm_lora_manager() - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora): - weight_dict = weight - weight = torch.tensor(weight_dict["weight"], dtype=weight_dict["dtype"]).to(current_platform.device_type) - super().update_parameter(parameter_name, weight, ranks_in_worker, is_lora) - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - RecvBucketManager.dict_to_meta(meta_infos) - super().broadcast_bucket(src_pp_rank, meta_infos, bucket_size) - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - RecvBucketManager.dict_to_meta(meta_infos) - buffer = torch.tensor(buffer, dtype=torch.int8, device=current_platform.device_type) - super().update_parameter_in_bucket(meta_infos, buffer, ranks_in_worker) - - def add_lora(self, peft_config) -> bool: - lora_int_id = int(time.time_ns() % 0x7FFFFFFF) - lora_request = TensorLoRARequest( - lora_name=f"{lora_int_id}", - lora_int_id=lora_int_id, - lora_path="dummy_lora_path", - peft_config=peft_config, - lora_tensors=self.lora_params, - ) - del self.lora_params - self.lora_params = OrderedDict() - super().reload_model() - return self.model_runner.add_lora(lora_request) diff --git a/roll/third_party/vllm/vllm_0_8_4/worker.py b/roll/third_party/vllm/vllm_0_8_4/worker.py deleted file mode 100644 index af38f2898..000000000 --- a/roll/third_party/vllm/vllm_0_8_4/worker.py +++ /dev/null @@ -1,16 +0,0 @@ -import gc -from typing import Optional - -import torch -from roll.platforms import current_platform - -from roll.third_party.vllm.worker_helper import WorkerHelper -from roll.utils.logging import get_logger - -logger = get_logger() - -Worker = current_platform.get_vllm_worker_class() - -class Worker084(WorkerHelper, Worker): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) diff --git a/roll/third_party/vllm/vllm_utils.py b/roll/third_party/vllm/vllm_utils.py index 2a61616ea..f8d65a86c 100644 --- a/roll/third_party/vllm/vllm_utils.py +++ b/roll/third_party/vllm/vllm_utils.py @@ -1,4 +1,3 @@ -# borrow from https://github.com/volcengine/verl/blob/main/verl/utils/vllm_utils.py from dataclasses import field from typing import List @@ -8,10 +7,14 @@ from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager +# TODO: remove this patch once vllm 0.8.4 is deprecated +# Patch weight loader for moe models +# borrow from https://github.com/volcengine/verl/blob/main/verl/utils/vllm_utils.py SUPPORTED_MOE_MODELS = [] try: from vllm.model_executor.models.deepseek_v2 import 
DeepseekV2ForCausalLM, DeepseekV3ForCausalLM + SUPPORTED_MOE_MODELS.append(DeepseekV2ForCausalLM) SUPPORTED_MOE_MODELS.append(DeepseekV3ForCausalLM) except ImportError: @@ -19,12 +22,14 @@ try: from vllm.model_executor.models.qwen2_moe import Qwen2MoeForCausalLM + SUPPORTED_MOE_MODELS.append(Qwen2MoeForCausalLM) except ImportError: pass try: from vllm.model_executor.models.qwen3_moe import Qwen3MoeForCausalLM + SUPPORTED_MOE_MODELS.append(Qwen3MoeForCausalLM) except ImportError: pass @@ -42,6 +47,7 @@ def patch_vllm_moe_model_weight_loader(model): if ("w13_weight" in name or "w2_weight" in name) and not skip_patch: param.weight_loader = mlp.experts.weight_loader + class TensorLoRARequest(LoRARequest): peft_config: dict = field(default=None) lora_tensors: dict = field(default=None) @@ -57,16 +63,21 @@ def load_adapter(self, lora_request: TensorLoRARequest) -> LoRAModel: To synchronize the LoRA tensors of the actor model, we need to find a workaround to enable VLLM to load memory-based LoRA tensors. """ try: + from packaging.version import Version + from vllm import __version__ as vllm_version + supported_lora_modules = self._adapter_manager.supported_lora_modules packed_modules_mapping = self._adapter_manager.packed_modules_mapping - expected_lora_modules: List[str] = [] + expected_lora_lst: list[str] = [] for module in supported_lora_modules: if module in packed_modules_mapping: - expected_lora_modules.extend(packed_modules_mapping[module]) + expected_lora_lst.extend(packed_modules_mapping[module]) else: - expected_lora_modules.append(module) + expected_lora_lst.append(module) + if module == "experts": + expected_lora_lst.append(module) - expected_lora_modules = list(set(expected_lora_modules)) + expected_lora_modules = list(set(expected_lora_lst)) lora_tensors = None from vllm.lora.peft_helper import PEFTHelper @@ -76,9 +87,15 @@ def load_adapter(self, lora_request: TensorLoRARequest) -> LoRAModel: lora_tensors = lora_request.lora_tensors peft_helper = PEFTHelper.from_dict(peft_config) else: + kwargs = {} + if Version(vllm_version) > Version("0.8.4"): + kwargs["tensorizer_config_dict"] = lora_request.tensorizer_config_dict lora_path = get_adapter_absolute_path(lora_request.lora_path) - - peft_helper = PEFTHelper.from_local_dir(lora_path, self.max_position_embeddings) + peft_helper = PEFTHelper.from_local_dir( + lora_path, + self.max_position_embeddings, + **kwargs, + ) # Validates the LoRA configuration against requirements before # loading weights, throwing an exception if validation fails. 
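For reference, a minimal sketch of the memory-based LoRA path this override enables; the tensor names, shapes and PEFT settings below are illustrative assumptions:

    import torch
    from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager

    patch_vllm_lora_manager()  # as done in roll/third_party/vllm/worker.py

    # LoRA weights held in memory instead of on disk (names/shapes assumed).
    lora_tensors = {
        "base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight": torch.zeros(8, 4096),
        "base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight": torch.zeros(4096, 8),
    }
    peft_config = {"r": 8, "lora_alpha": 16, "target_modules": ["q_proj"]}  # assumed values

    request = TensorLoRARequest(
        lora_name="1", lora_int_id=1, lora_path="dummy_lora_path",
        peft_config=peft_config, lora_tensors=lora_tensors,
    )
    # worker.model_runner.add_lora(request)  # the patched manager loads the tensors from memory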
@@ -92,19 +109,33 @@ def load_adapter(self, lora_request: TensorLoRARequest) -> LoRAModel: hf_to_vllm_mapper = model.hf_to_vllm_mapper if isinstance(lora_request, TensorLoRARequest): + kwargs = {} + if Version(vllm_version) >= Version("0.12.0"): + kwargs["model_vocab_size"] = self.vocab_size + else: + kwargs["embeddings"] = None + kwargs["target_embedding_padding"] = self.vocab_size + self.lora_config.lora_extra_vocab_size + kwargs["embedding_modules"] = self.embedding_modules + kwargs["embedding_padding_modules"] = self.embedding_padding_modules lora = self._lora_model_cls.from_lora_tensors( lora_model_id=lora_request.lora_int_id, tensors=lora_tensors, peft_helper=peft_helper, device="cpu", dtype=self.lora_config.lora_dtype, - embeddings=None, - target_embedding_padding=self.vocab_size + self.lora_config.lora_extra_vocab_size, - embedding_modules=self.embedding_modules, - embedding_padding_modules=self.embedding_padding_modules, weights_mapper=hf_to_vllm_mapper, + **kwargs, ) else: + kwargs = {} + if Version(vllm_version) > Version("0.8.4"): + kwargs["tensorizer_config_dict"] = lora_request.tensorizer_config_dict + if Version(vllm_version) >= Version("0.12.0"): + kwargs["model_vocab_size"] = self.vocab_size + else: + kwargs["target_embedding_padding"] = self.vocab_size + self.lora_config.lora_extra_vocab_size + kwargs["embedding_modules"] = self.embedding_modules + kwargs["embedding_padding_modules"] = self.embedding_padding_modules lora = self._lora_model_cls.from_local_checkpoint( lora_path, expected_lora_modules, @@ -112,18 +143,12 @@ def load_adapter(self, lora_request: TensorLoRARequest) -> LoRAModel: lora_model_id=lora_request.lora_int_id, device="cpu", dtype=self.lora_config.lora_dtype, - target_embedding_padding=self.vocab_size + self.lora_config.lora_extra_vocab_size, - embedding_modules=self.embedding_modules, - embedding_padding_modules=self.embedding_padding_modules, weights_mapper=hf_to_vllm_mapper, + **kwargs, ) except Exception as e: raise e - if lora.extra_vocab_size > self.lora_config.lora_extra_vocab_size: - raise ValueError( - f"LoRA added vocab size {lora.extra_vocab_size} is greater than lora_extra_vocab_size {self.lora_config.lora_extra_vocab_size}." - ) return lora setattr(LRUCacheWorkerLoRAManager, "_load_adapter", load_adapter) diff --git a/roll/third_party/vllm/worker.py b/roll/third_party/vllm/worker.py new file mode 100644 index 000000000..ea82ceb40 --- /dev/null +++ b/roll/third_party/vllm/worker.py @@ -0,0 +1,169 @@ +import gc +import hashlib +import json +import time +from collections import OrderedDict +from typing import Iterable, Tuple + +import torch +import vllm +from packaging.version import Version + +from roll.platforms import current_platform +from roll.third_party.vllm.vllm_utils import TensorLoRARequest, patch_vllm_lora_manager +from roll.utils.collective import collective +from roll.utils.cuda_ipc_utils import MultiprocessingSerializer +from roll.utils.logging import get_logger +from roll.utils.send_recv_utils import monkey_patch_torch_reductions, named_tensors_from_bucket + +logger = get_logger() + + +class TensorLoraManager: + def __init__(self): + self.lora_params = OrderedDict() + self.add_lora_count = 0 + + def add_weight(self, name: str, weight: torch.Tensor): + self.lora_params[name] = weight + + def build_request(self, peft_config: dict) -> TensorLoRARequest: + """ + Generate a unique LoRA ID based on the PEFT configuration rather than + using a timestamp to assert all tp-ranks get the same LoRA ID. 
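+
+        For example, every TP rank that calls build_request() with an identical
+        peft_config (and the same running add_lora_count) derives the same
+        lora_int_id, since the ID is sha256(json.dumps(config, sort_keys=True))
+        taken modulo 0x7FFFFFFF.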
+ """ + self.add_lora_count += 1 + peft_config["add_lora_count"] = self.add_lora_count + peft_config_str = json.dumps(peft_config, sort_keys=True) + hash_obj = hashlib.sha256(peft_config_str.encode("utf-8")) + hex_dig = hash_obj.hexdigest() + lora_int_id = int(hex_dig, 16) % 0x7FFFFFFF + + lora_request = TensorLoRARequest( + lora_name=f"{lora_int_id}", + lora_int_id=lora_int_id, + lora_path="dummy_lora_path", + peft_config=peft_config, + lora_tensors=self.lora_params, + ) + del self.lora_params + self.lora_params = OrderedDict() + return lora_request + + +class WorkerBase: + def custom_init_worker(self, *args, **kwargs): + self.weight_loaded: bool = True + self.kv_cache_loaded: bool = True + self.buffers = None + self.buffer_cache = None + self.tensor_lora_manager = TensorLoraManager() + + def reload_model(self): + if not self.weight_loaded: + self.wake_up(["weights"]) + self.weight_loaded = True + + def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): + # before updating the parameters, we need to reinitialize the previously released model + self.reload_model() + if vllm.__version__ < "0.8.5": + from roll.third_party.vllm.vllm_utils import patch_vllm_moe_model_weight_loader + + patch_vllm_moe_model_weight_loader(self.model_runner.model) + self.model_runner.model.load_weights(weights=weights) + + def load_states(self): + self.reload_model() + if not self.kv_cache_loaded: + self.wake_up(["kv_cache"]) + self.kv_cache_loaded = True + if vllm.__version__ < "0.8.5" and self.buffers is not None: + # https://github.com/vllm-project/vllm/issues/16564 + model = self.model_runner.model + for name, buffer in model.named_buffers(): + if name in self.buffers: + buffer.data.copy_(self.buffers[name].data) + self.buffers = None + + def offload_states(self, level): + assert (self.weight_loaded and self.kv_cache_loaded) or (not self.weight_loaded and not self.kv_cache_loaded) + if not self.weight_loaded: + return + if vllm.__version__ < "0.8.5" and level == 2: + # https://github.com/vllm-project/vllm/issues/16564 + model = self.model_runner.model + self.buffers = {name: buffer.cpu().clone() for name, buffer in model.named_buffers()} + self.sleep(level) + self.weight_loaded = False + self.kv_cache_loaded = False + if hasattr(self, "recv_manager"): + self.recv_manager.clear() + gc.collect() + current_platform.empty_cache() + + def setup_collective_group(self, master_address, master_port, rank_offset, world_size, group_name, backend): + group_rank = self.rank + rank_offset + collective.init_collective_group( + world_size, + rank=group_rank, + backend=backend, + group_name=group_name, + master_addr=master_address, + master_port=master_port, + ) + logger.info(f"setup_collective_group: {group_name} rank: {group_rank} world_size: {world_size}") + + def broadcast_parameter(self, names, dtypes, shapes, group_name, is_lora=False): + weights_and_handles = [] + for name, dtype, shape in zip(names, dtypes, shapes): + target_dtype = dtype if isinstance(dtype, torch.dtype) else getattr(torch, dtype) + weight = torch.empty(shape, dtype=target_dtype, device=self.device) + handle = collective.broadcast(tensor=weight, src_rank=0, group_name=group_name, async_op=True) + weights_and_handles.append((name, weight, handle)) + + def weights_iter(): + for name, weight, handle in weights_and_handles: + handle.wait() + yield name, weight + + if is_lora: + for name, weight in weights_iter(): + self.tensor_lora_manager.add_weight(name, weight) + return + self.load_weights(weights=weights_iter()) + + def 
update_parameter_in_bucket(self, serialized_named_tensors, is_lora=False): + monkey_patch_torch_reductions() + bucket_with_meta = MultiprocessingSerializer.deserialize(serialized_named_tensors[self.rank]) + named_params = named_tensors_from_bucket(**bucket_with_meta) + if is_lora: + for name, weight in named_params: + self.tensor_lora_manager.add_weight(name, weight) + return + self.load_weights([(name, weight) for name, weight in named_params]) + + def process_weights_after_loading(self): + if (Version("0.11.0") == Version(vllm.__version__) or + Version("0.11.1rc1") == Version(vllm.__version__) or + Version("0.11.1rc2.dev0+gc3a722fcb.d20251021") == Version(vllm.__version__)): + from vllm.model_executor.model_loader.utils import process_weights_after_loading,set_default_torch_dtype + device_config = self.device_config + load_config = self.vllm_config.load_config + load_device = (device_config.device if load_config.device is None else load_config.device) + target_device = torch.device(load_device) + with set_default_torch_dtype(self.model_config.dtype): + process_weights_after_loading(self.model_runner.model,self.model_config,target_device) + + +class WorkerV1(WorkerBase): + def custom_init_worker(self, *args, **kwargs): + super().custom_init_worker(*args, **kwargs) + patch_vllm_lora_manager() + + # Use custom prefix because worker_extension_cls can not has + # conflicting method name with vllm worker. + def custom_add_lora(self, peft_config) -> bool: + lora_request = self.tensor_lora_manager.build_request(peft_config) + super().reload_model() + return self.model_runner.add_lora(lora_request) diff --git a/roll/third_party/vllm/worker_helper.py b/roll/third_party/vllm/worker_helper.py deleted file mode 100644 index e6b0d9045..000000000 --- a/roll/third_party/vllm/worker_helper.py +++ /dev/null @@ -1,120 +0,0 @@ -import gc -from typing import Tuple, Iterable - -import torch -import torch.distributed as dist -import vllm - -from roll.utils.collective import collective -from roll.utils.functionals import get_dist_info_from_comm_plan -from roll.utils.logging import get_logger -from roll.utils.send_recv_utils import RecvBucketManager -from roll.third_party.vllm.vllm_utils import patch_vllm_moe_model_weight_loader -from roll.platforms import current_platform - -logger = get_logger() - - -class WorkerHelper: - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - self.weight_loaded : bool = True - self.kv_cache_loaded : bool = True - self.buffers = None - - def reload_model(self): - if not self.weight_loaded: - self.wake_up(["weights"]) - self.weight_loaded = True - - def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): - # before updating the parameters, we need to reinitialize the previously released model - self.reload_model() - patch_vllm_moe_model_weight_loader(self.model_runner.model) - self.model_runner.model.load_weights(weights=weights) - - def load_states(self): - self.reload_model() - if not self.kv_cache_loaded: - self.wake_up(["kv_cache"]) - self.kv_cache_loaded = True - if vllm.__version__ < "0.8.5" and self.buffers is not None: - # https://github.com/vllm-project/vllm/issues/16564 - model = self.model_runner.model - for name, buffer in model.named_buffers(): - if name in self.buffers: - buffer.data.copy_(self.buffers[name].data) - self.buffers = None - - def offload_states(self, level): - assert (self.weight_loaded and self.kv_cache_loaded) or (not self.weight_loaded and not self.kv_cache_loaded) - if not self.weight_loaded: - return - if 
vllm.__version__ < "0.8.5" and level == 2: - # https://github.com/vllm-project/vllm/issues/16564 - model = self.model_runner.model - self.buffers = {name: buffer.cpu().clone() for name, buffer in model.named_buffers()} - self.sleep(level) - self.weight_loaded = False - self.kv_cache_loaded = False - if hasattr(self, 'recv_manager'): - self.recv_manager.clear() - gc.collect() - current_platform.empty_cache() - - def setup_collective_group(self, comm_plan, backend, rank_in_cluster): - self.model_update_comm_plan = getattr(self, "model_update_comm_plan", {}) - rank, comm_plan_args = get_dist_info_from_comm_plan(comm_plan, rank_in_cluster=rank_in_cluster, - rank_in_worker=dist.get_rank()) - if rank is None: - logger.info(f"no comm_plan found for rank {rank_in_cluster}/{dist.get_rank()}") - return - group_name = comm_plan_args["group_name"] - master_addr = comm_plan_args["master_addr"] - master_port = comm_plan_args["master_port"] - world_size = len(comm_plan_args["tgt_devices"]) + 1 - src_pp_rank = comm_plan_args["src_pp_rank"] - collective.init_collective_group(world_size, rank, backend=backend, group_name=group_name, - master_addr=master_addr, master_port=master_port) - # A small all_reduce for warmup. - collective.allreduce(torch.zeros(1).to(current_platform.device_type), group_name=group_name) - self.model_update_comm_plan[src_pp_rank] = dict(rank=rank, - world_size=world_size, - src_pp_rank=src_pp_rank, - group_name=group_name, - comm_plan=comm_plan, - comm_plan_args=comm_plan_args) - logger.info(f"warmup setup_collective_group: {group_name} rank: {rank} world_size: {world_size}") - - def broadcast_bucket(self, src_pp_rank, meta_infos, bucket_size): - if src_pp_rank not in self.model_update_comm_plan: - return - comm_plan = self.model_update_comm_plan[src_pp_rank] - buffer = torch.empty(bucket_size, dtype=torch.int8, device=current_platform.device_type) - collective.broadcast(tensor=buffer, src_rank=0, group_name=comm_plan["group_name"]) - WorkerHelper.update_parameter_in_bucket(self, meta_infos, buffer, [dist.get_rank()]) - - def broadcast_parameter(self, src_pp_rank, dtype, shape, parameter_name, is_lora=False): - if src_pp_rank not in self.model_update_comm_plan: - return - comm_plan = self.model_update_comm_plan[src_pp_rank] - weight = torch.empty(shape, dtype=dtype, device=current_platform.device_type) - collective.broadcast(tensor=weight, src_rank=0, group_name=comm_plan["group_name"]) - WorkerHelper.update_parameter(self, parameter_name, weight, [dist.get_rank()], is_lora=is_lora) - - def update_parameter(self, parameter_name, weight, ranks_in_worker, is_lora=False): - if is_lora: - self.lora_params[parameter_name] = weight - return - if dist.get_rank() not in ranks_in_worker: - return - self.load_weights([(parameter_name, weight)]) - del weight - - def update_parameter_in_bucket(self, meta_infos, buffer, ranks_in_worker): - if dist.get_rank() not in ranks_in_worker: - return - self.recv_manager = getattr(self, "recv_manager", RecvBucketManager()) - named_params = self.recv_manager.process_bucket(meta_infos, buffer) - del buffer - self.load_weights([(name, weight) for name, weight in named_params.items()]) \ No newline at end of file diff --git a/roll/utils/asyncio_decorator.py b/roll/utils/asyncio_decorator.py new file mode 100644 index 000000000..5ae05620e --- /dev/null +++ b/roll/utils/asyncio_decorator.py @@ -0,0 +1,41 @@ +import asyncio +import functools +from typing import Callable, Coroutine, Any, Type + +def run_sync(func: Callable[..., Coroutine[Any, Any, Any]]) -> 
Callable[..., Any]: + """ + A decorator to run an async method synchronously. + It gets or creates an event loop and runs the async method until it completes. + """ + @functools.wraps(func) + def wrapper(*args, **kwargs): + assert asyncio.iscoroutinefunction(func) + try: + loop = asyncio.get_event_loop() + except: + loop = asyncio.new_event_loop() + asyncio.set_event_loop(loop) + assert not loop.is_closed() and not loop.is_running() + coro = func(*args, **kwargs) + return loop.run_until_complete(coro) + return wrapper + +def create_sync_class(OriginalCls: Type) -> Type: + """ + A factory function that creates a synchronous subclass of a given class. + + It generates and returns a new class that inherits from the original + but overrides all of its `async def` methods with synchronous wrappers. + + The name of the new class will be 'Sync' + original name. + """ + new_class_attrs = { + '__doc__': OriginalCls.__doc__ + } + + for name, method in OriginalCls.__dict__.items(): + if not name.startswith('_') and asyncio.iscoroutinefunction(method): + new_class_attrs[name] = run_sync(method) + + SyncVersion = type(f"Sync{OriginalCls.__name__}", (OriginalCls,), new_class_attrs) + return SyncVersion \ No newline at end of file diff --git a/roll/utils/collective/collective.py b/roll/utils/collective/collective.py index ea22a3983..78bcd5fcb 100644 --- a/roll/utils/collective/collective.py +++ b/roll/utils/collective/collective.py @@ -1,4 +1,4 @@ -from typing import Union +from typing import Union, Optional from torch._C._distributed_c10d import ReduceOp from torch.distributed import Backend @@ -21,13 +21,14 @@ def __init__(self): self._name_group_map = {} self._group_name_map = {} - def create_collective_group(self, backend, world_size, rank, master_addr: str, master_port: int, group_name): + def create_collective_group(self, backend, world_size, rank, master_addr: str, master_port: int, group_name, global_ranks=None): self._name_group_map[group_name] = init_custom_process_group( backend=backend, init_method=f"tcp://{master_addr}:{master_port}", world_size=world_size, rank=rank, group_name=group_name, + global_ranks=global_ranks ) return self._name_group_map[group_name] @@ -65,6 +66,7 @@ def init_collective_group( master_port: int, backend: Union[str, Backend] = current_platform.communication_backend, group_name: str = "default", + global_ranks: Optional[list] = None, ): global _group_mgr if not group_name: @@ -76,7 +78,7 @@ def init_collective_group( assert world_size > 0 assert rank >= 0 assert rank < world_size - _group_mgr.create_collective_group(backend, world_size, rank, master_addr, master_port, group_name) + _group_mgr.create_collective_group(backend, world_size, rank, master_addr, master_port, group_name, global_ranks=global_ranks) def allreduce(tensor, group_name: str = "default", op=ReduceOp.SUM): @@ -84,11 +86,20 @@ def allreduce(tensor, group_name: str = "default", op=ReduceOp.SUM): dist.all_reduce(tensor, op=op, group=_group_mgr.get_group_by_name(group_name)) -def broadcast(tensor, src_rank: int = 0, group_name: str = "default"): +def broadcast(tensor, src_rank: int = 0, group_name: str = "default", async_op=False): global _group_mgr - dist.broadcast(tensor, src=src_rank, group=_group_mgr.get_group_by_name(group_name)) - + return dist.broadcast(tensor, src=src_rank, group=_group_mgr.get_group_by_name(group_name), async_op=async_op) def barrier(group_name): global _group_mgr dist.barrier(group=_group_mgr.get_group_by_name(group_name), device_ids=[0]) + +def all_gather_object(object_list, 
obj, group_name): + global _group_mgr + dist.all_gather_object(object_list, obj, group=_group_mgr.get_group_by_name(group_name)) + +def broadcast_object_list(object_list, src=None, group_name="default", device=None, group_src=None): + global _group_mgr + assert (src is not None and group_src is None) or (src is None and group_src is not None),\ + ("Either src or group_src must be set, but they cannot be set simultaneously.") + dist.broadcast_object_list(object_list, src=src, group_src=group_src, group=_group_mgr.get_group_by_name(group_name)) diff --git a/roll/utils/collective/pg_utils.py b/roll/utils/collective/pg_utils.py index d6d384b22..2ac8bc78a 100644 --- a/roll/utils/collective/pg_utils.py +++ b/roll/utils/collective/pg_utils.py @@ -16,6 +16,7 @@ def init_custom_process_group( store=None, group_name=None, pg_options=None, + global_ranks=None, ): from torch.distributed.distributed_c10d import ( Backend, @@ -60,7 +61,7 @@ def init_custom_process_group( pg, _ = _new_process_group_helper( world_size, rank, - [], + global_ranks if global_ranks is not None else [], backend, store, group_name=group_name, @@ -68,7 +69,10 @@ def init_custom_process_group( timeout=timeout, ) - _world.pg_group_ranks[pg] = {i: i for i in range(world_size)} + if global_ranks is not None: + _world.pg_group_ranks[pg] = {gr: lr for lr, gr in enumerate(global_ranks)} + else: + _world.pg_group_ranks[pg] = {i: i for i in range(world_size)} # 多device id时,barrier还需要指定device_ids,不然会校验所有相关的device是否有相同 # barrier(group=pg, device_ids=[0]) diff --git a/roll/utils/constants.py b/roll/utils/constants.py index 4884f384c..94e5fb875 100644 --- a/roll/utils/constants.py +++ b/roll/utils/constants.py @@ -25,6 +25,7 @@ class GenerateStopReason(enum.Enum): FINISH = enum.auto() ABORT = enum.auto() MAX_LENGTH = enum.auto() + NO_SYSTEM_PROMPT = enum.auto() class EpisodeStopReason(enum.Enum): @@ -37,4 +38,5 @@ class EpisodeStopReason(enum.Enum): ENV_TIMEOUT = "env_timeout" LLM_GENERATE_FAILED = "llm_generate_failed" UNKNOWN = "unknown" - NO_SYSTEM_PROMPT = "no_system_prompt" \ No newline at end of file + NO_SYSTEM_PROMPT = "no_system_prompt" + EVAL_GT = "eval_gt" \ No newline at end of file diff --git a/roll/utils/context_parallel/__init__.py b/roll/utils/context_parallel/__init__.py index fff4995fc..8112b8d2b 100644 --- a/roll/utils/context_parallel/__init__.py +++ b/roll/utils/context_parallel/__init__.py @@ -1,5 +1,4 @@ from roll.utils.context_parallel.globals import get_ulysses_group, set_upg_manager from roll.utils.context_parallel.monkey_patch import apply_ulysses_patch, unapply_ulysses_patch - __all__ = ["set_upg_manager", "get_ulysses_group", "apply_ulysses_patch", "unapply_ulysses_patch"] diff --git a/roll/utils/context_parallel/all_to_all.py b/roll/utils/context_parallel/all_to_all.py index 1fff2a84b..155457273 100644 --- a/roll/utils/context_parallel/all_to_all.py +++ b/roll/utils/context_parallel/all_to_all.py @@ -36,19 +36,24 @@ def all_to_all_4D( # Pad sequence for multi-modality use case ulysses_seqlen = [torch.zeros(1, dtype=torch.int64, device=input.device) for _ in range(seq_world_size)] dist.barrier(group=group) - dist.all_gather(ulysses_seqlen, torch.tensor(shard_seqlen, device=input.device), group=group) + dist.all_gather( + ulysses_seqlen, + torch.tensor([shard_seqlen], device=input.device), + group=group, + ) set_ulysses_seqlen(ulysses_seqlen) max_global_length = max(ulysses_seqlen) # pad to the second dimension to the longest input = torch.nn.functional.pad(input, (0, 0, 0, 0, 0, max_global_length - shard_seqlen)) 
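+        # Each CP rank may hold a different shard length (e.g. multi-modal inputs), so
+        # every shard is padded to the global maximum before all_to_all_single, which
+        # expects equal-sized chunks; the padding is stripped again after the exchange.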
- seqlen = max_global_length * seq_world_size + shard_seqlen_padded = int(max_global_length.item()) + seqlen_padded = shard_seqlen_padded * seq_world_size shard_hc = hc // seq_world_size # transpose groups of heads with the seq-len parallel dimension, so that we can scatter them! # (bs, seqlen/P, hc, hs) -reshape-> (bs, seq_len/P, P, hc/P, hs) -transpose(0,2)-> (P, seq_len/P, bs, hc/P, hs) - input_t = input.reshape(bs, shard_seqlen, seq_world_size, shard_hc, hs).transpose(0, 2).contiguous() + input_t = input.reshape(bs, shard_seqlen_padded, seq_world_size, shard_hc, hs).transpose(0, 2).contiguous() output = torch.empty_like(input_t) # https://pytorch.org/docs/stable/distributed.html#torch.distributed.all_to_all_single @@ -61,18 +66,19 @@ def all_to_all_4D( else: output = input_t # if scattering the seq-dim, transpose the heads back to the original dimension - output = output.reshape(seqlen, bs, shard_hc, hs) + output = output.reshape(seqlen_padded, bs, shard_hc, hs) # then we will unpad it back - output_list = torch.split(output, max_global_length.item(), dim=0) + output_list = torch.split(output, shard_seqlen_padded, dim=0) assert len(output_list) == seq_world_size unpadded_output_list = [_output[: _seqlen.item()] for _output, _seqlen in zip(output_list, ulysses_seqlen)] # Concatenate the unpadded tensors back together output = torch.cat(unpadded_output_list) + seqlen_actual = int(output.size(0)) - # (seq_len, bs, hc/P, hs) -reshape-> (bs, seq_len, hc/P, hs) - output = output.transpose(0, 1).contiguous().reshape(bs, seqlen, shard_hc, hs) + # (seq_len_actual, bs, hc/P, hs) -> (bs, seq_len_actual, hc/P, hs) + output = output.transpose(0, 1).contiguous().reshape(bs, seqlen_actual, shard_hc, hs) return output @@ -117,11 +123,11 @@ def all_to_all_4D( output = output.reshape(hc, max_global_length, bs, hs) # unpad the output - self_length = ulysses_seqlen[dist.get_rank(group=group)] + self_length = int(ulysses_seqlen[dist.get_rank(group=group)].item()) output = output[:, :self_length, :, :] - # (hc, seqlen/N, bs, hs) -tranpose(0,2)-> (bs, seqlen/N, hc, hs) - output = output.transpose(0, 2).contiguous().reshape(bs, max_global_length, hc, hs) + # (hc, local_seqlen, bs, hs) -> (bs, local_seqlen, hc, hs) + output = output.transpose(0, 2).contiguous().reshape(bs, self_length, hc, hs) return output else: diff --git a/roll/utils/context_parallel/autograd_gather.py b/roll/utils/context_parallel/autograd_gather.py new file mode 100644 index 000000000..4a2ee5f6f --- /dev/null +++ b/roll/utils/context_parallel/autograd_gather.py @@ -0,0 +1,98 @@ +""" +PumpkinComment: + +Why this exists: +- CP ranks typically see the same (replicated) batch, but operate on different sequence shards. +- Downstream loss code often wants full-sequence tensors (e.g., log_probs, entropy). +- A naive gather using torch.distributed.nn.functional.all_gather has a backward that performs + ReduceScatter(SUM)-like behavior, which interacts poorly with replicated-loss semantics. + +- forward: gather shards and concatenate along `gather_dim` +- backward: *slice only* the gradient shard for this rank +- optional `grad_scaler`: multiply grad_output by world_size before slicing, so that if an outer + data-parallel reduction averages across CP replicas, the effective gradient matches cp_size=1. 
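+
+Example (a minimal sketch; `cp_group` is assumed to be the Ulysses process group, e.g. obtained
+via `get_ulysses_group()`):
+
+    full_log_probs = ulysses_gather(local_log_probs, gather_dim=1, group=cp_group)
+
+The returned tensor holds the full sequence on every CP rank; its backward slices the incoming
+gradient so each rank keeps only the piece corresponding to its own shard.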
+ +Reference: https://github.com/volcengine/verl/blob/main/verl/utils/ulysses.py +""" + +from typing import Optional + +import torch +import torch.distributed as dist + + +class _UlyssesGather(torch.autograd.Function): + @staticmethod + def forward( + ctx, + group: dist.ProcessGroup, + local_tensor: torch.Tensor, + gather_dim: int, + grad_scaler: bool, + ) -> torch.Tensor: + # Normalize dim. + if gather_dim < 0: + gather_dim = local_tensor.dim() + gather_dim + + world_size = dist.get_world_size(group=group) + rank = dist.get_rank(group=group) + part_size = local_tensor.size(gather_dim) + + ctx.group = group + ctx.gather_dim = gather_dim + ctx.grad_scaler = grad_scaler + ctx.world_size = world_size + ctx.rank = rank + ctx.part_size = part_size + + # Move gather_dim to leading dim so we can use all_gather_into_tensor on dim0. + x_perm = local_tensor.movedim(gather_dim, 0).contiguous() + out_perm = torch.empty( + (world_size * x_perm.size(0),) + tuple(x_perm.shape[1:]), + device=x_perm.device, + dtype=x_perm.dtype, + ) + dist.all_gather_into_tensor(out_perm, x_perm, group=group) + + full = out_perm.movedim(0, gather_dim).contiguous() + return full + + @staticmethod + def backward(ctx, grad_output: torch.Tensor): + # grad_output is the full concatenated tensor on every rank. + if ctx.grad_scaler: + grad_output = grad_output * ctx.world_size + + gdim = ctx.gather_dim + if gdim < 0: + gdim = grad_output.dim() + gdim + + grad_perm = grad_output.movedim(gdim, 0).contiguous() + start = ctx.rank * ctx.part_size + end = (ctx.rank + 1) * ctx.part_size + grad_local_perm = grad_perm[start:end].contiguous() + grad_local = grad_local_perm.movedim(0, gdim).contiguous() + return None, grad_local, None, None + + +def ulysses_gather( + x: torch.Tensor, + *, + gather_dim: int, + group: Optional[dist.ProcessGroup], + grad_scaler: bool = True, +) -> torch.Tensor: + """ + Gather shards across `group` and concatenate along `gather_dim` with autograd-friendly backward. + + Args: + x: local shard tensor + gather_dim: dim to concatenate along + group: process group (if None or world_size<=1, returns x) + grad_scaler: whether to scale grad_output by world_size before slicing in backward + """ + if group is None: + return x + if dist.get_world_size(group=group) <= 1: + return x + return _UlyssesGather.apply(group, x, gather_dim, grad_scaler) diff --git a/roll/utils/context_parallel/hf_flash_attention_patch.py b/roll/utils/context_parallel/hf_flash_attention_patch.py new file mode 100644 index 000000000..5524f9b56 --- /dev/null +++ b/roll/utils/context_parallel/hf_flash_attention_patch.py @@ -0,0 +1,404 @@ +import inspect +from typing import Any, Callable, Dict, Optional, Tuple + +import torch +import torch.distributed as dist + +from roll.utils.context_parallel.all_to_all import SeqAllToAll4D, all_to_all_4D +from roll.utils.context_parallel.globals import get_ulysses_group, get_ulysses_size +from roll.utils.context_parallel.ulysses_attention import expandKV +from roll.utils.logging import get_logger + +logger = get_logger() + +_DTYPE_ID_TO_DTYPE = { + 0: torch.int32, + 1: torch.int64, + 2: torch.bool, + 3: torch.float16, + 4: torch.bfloat16, + 5: torch.float32, +} + + +def _dtype_to_id(dtype: torch.dtype) -> int: + for k, v in _DTYPE_ID_TO_DTYPE.items(): + if v == dtype: + return k + return -1 + + +def _sync_optional_tensor_meta( + t: Any, + *, + group: dist.ProcessGroup, + dev: torch.device, +) -> Tuple[bool, Optional[torch.dtype]]: + """ + Synchronize whether `t` is a tensor across `group` and (if present) its dtype. 
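+
+    Ranks must agree on whether an optional argument takes part in the upcoming collective;
+    if some ranks held a tensor and others held None, part of the group would skip the
+    all_gather and the job would hang.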
+ Returns: + (global_present, global_dtype_if_present) + """ + present = 1 if torch.is_tensor(t) else 0 + dtype_id = _dtype_to_id(t.dtype) if torch.is_tensor(t) else -1 + meta = torch.tensor([present, dtype_id], device=dev, dtype=torch.int32) + metas = [torch.empty_like(meta) for _ in range(dist.get_world_size(group))] + dist.all_gather(metas, meta, group=group) + meta_stack = torch.stack(metas, dim=0) + + global_present = bool(int(meta_stack[:, 0].max().item()) == 1) + if not global_present: + return False, None + + present_mask = meta_stack[:, 0] == 1 + dtype_ids = meta_stack[present_mask][:, 1] + dtype_min = int(dtype_ids.min().item()) + dtype_max = int(dtype_ids.max().item()) + if dtype_min != dtype_max or dtype_min not in _DTYPE_ID_TO_DTYPE: + return True, None + return True, _DTYPE_ID_TO_DTYPE[dtype_min] + + +_PATCH_STATE: Dict[str, Any] = { + "patched": False, + "orig_modeling_flash_attention_forward": None, + "orig_integrations_flash_attention_forward": None, +} + + +def _normalize_position_ids_for_fa_varlen(position_ids: Any) -> Any: + """ + Normalize `position_ids` for HF FlashAttention varlen bookkeeping. + + Some Transformers versions derive FlashAttention varlen `cu_seqlens` by scanning `position_ids == 0` + to find packed-sequence boundaries. In some pipelines, user-provided `position_ids` starts from 1, + meaning there are no zeros and boundary detection fails. + + In typical HF attention implementations, RoPE is applied to Q/K before calling the (FlashAttention) + forward, so `position_ids` passed into `_flash_attention_forward` is used for varlen metadata, not + for rotary math. Therefore shifting it here is safe for correctness of attention computation. + + Policy: + - If `position_ids` is an int tensor of shape (seqlen,) or (bs, seqlen) and the first token of each + sequence is not 0 (e.g. starts from 1), shift each sequence by its first value so it starts at 0. + This also works when CP-align padding introduces zeros later in the tensor (e.g. rmpad adds [0..pad)). + - Otherwise return it unchanged. + + Note: + - This normalization is intentionally applied *after* we gather `position_ids` to the global sequence + for Ulysses CP so that every rank sees consistent varlen metadata. + """ + if not torch.is_tensor(position_ids): + return position_ids + if position_ids.numel() == 0: + return position_ids + if position_ids.dtype not in (torch.int32, torch.int64): + return position_ids + if position_ids.dim() not in (1, 2): + return position_ids + + if position_ids.dim() == 1: + start_val = position_ids[:1] # [1] + if int(start_val.item()) == 0: + return position_ids + if int(start_val.item()) < 0: + return position_ids + return position_ids - start_val + + # dim == 2: shift each row by its own first token + start_val = position_ids[:, :1] # [bs, 1] + # If all rows already start at 0, leave unchanged. + if bool(torch.all(start_val == 0).item()): + return position_ids + # Avoid shifting for negative/sentinel schemes. 
+ if bool(torch.any(start_val < 0).item()): + return position_ids + return position_ids - start_val + + +def _pad_to(t: torch.Tensor, target_len: int, *, dim: int = -1, pad_value: int = 0) -> torch.Tensor: + if dim < 0: + dim = dim % t.ndim + if t.size(dim) >= target_len: + return t + pad_len = target_len - t.size(dim) + pad = [0, 0] * t.ndim + pad[2 * (t.ndim - 1 - dim) + 1] = pad_len + return torch.nn.functional.pad(t, pad, value=pad_value) + + +def _gather_sharded_seq_tensor( + local: torch.Tensor, + *, + group: dist.ProcessGroup, + shard_lens: torch.Tensor, +) -> torch.Tensor: + world_size = dist.get_world_size(group) + max_len = int(shard_lens.max().item()) + + local_padded = _pad_to(local, max_len, dim=-1, pad_value=0).contiguous() + gathered = [ + torch.empty(local_padded.shape, device=local_padded.device, dtype=local_padded.dtype) + for _ in range(world_size) + ] + dist.all_gather(gathered, local_padded, group=group) + + pieces = [] + for i, g in enumerate(gathered): + li = int(shard_lens[i].item()) + pieces.append(g[..., :li]) + return torch.cat(pieces, dim=-1) + + +def _maybe_repeat_kv_for_ulysses( + key_states: torch.Tensor, + value_states: torch.Tensor, + *, + ulysses_size: int, +) -> Tuple[torch.Tensor, torch.Tensor]: + # PumpkinComment: (bs, seqlen, n_kv_heads, head_dim) + n_kv = key_states.size(2) + if ulysses_size > n_kv: + assert ( + ulysses_size % n_kv == 0 + ), f"ulysses_size={ulysses_size} must be divisible by num_key_value_heads={n_kv} (or vice versa)." + repeats = ulysses_size // n_kv + k = key_states.transpose(1, 2) + v = value_states.transpose(1, 2) + k, v = expandKV(k, v, repeats, 1) + return k.transpose(1, 2), v.transpose(1, 2) + return key_states, value_states + + +def make_ulysses_flash_attention_forward( + original_forward: Callable[..., Any], +) -> Callable[..., Any]: + """ + Wrap HF `_flash_attention_forward` by inserting Ulysses all-to-all before and after. + """ + + def _wrapped(*args: Any, **kwargs: Any) -> Any: + ulysses_group = get_ulysses_group() + ulysses_size = get_ulysses_size() or 1 + + # If Ulysses isn't enabled, do nothing. + if ulysses_group is None or ulysses_size <= 1: + return original_forward(*args, **kwargs) + + query_states = kwargs.get("query_states", args[0] if len(args) > 0 else None) + key_states = kwargs.get("key_states", args[1] if len(args) > 1 else None) + value_states = kwargs.get("value_states", args[2] if len(args) > 2 else None) + attention_mask = kwargs.get("attention_mask", args[3] if len(args) > 3 else None) + query_length = kwargs.get("query_length", args[4] if len(args) > 4 else None) + # Some callers pass `position_ids` positionally (Transformers signature has it after dropout). + # Handle both forms to avoid silently skipping the CP gather path for packed/varlen attention. + position_ids = kwargs.get("position_ids", args[7] if len(args) > 7 else None) + + if query_states is None or key_states is None or value_states is None: + return original_forward(*args, **kwargs) + + if query_states.dim() != 4: + # Unexpected, fall back. 
+            return original_forward(*args, **kwargs)
+
+        layout = "bshd"  # (b, s, h, d)
+        dev = query_states.device
+        attn_present, attn_dtype = _sync_optional_tensor_meta(attention_mask, group=ulysses_group, dev=dev)
+        pos_present, pos_dtype = _sync_optional_tensor_meta(position_ids, group=ulysses_group, dev=dev)
+        if torch.is_tensor(attention_mask) and attention_mask.dim() == 2:
+            seq_len_local = attention_mask.size(1)
+            if query_states.size(1) != seq_len_local and query_states.size(2) == seq_len_local:
+                layout = "bhsd"
+        elif position_ids is not None and torch.is_tensor(position_ids):
+            seq_len_local = position_ids.size(-1)
+            if query_states.size(1) != seq_len_local and query_states.size(2) == seq_len_local:
+                layout = "bhsd"
+
+        if layout == "bhsd":
+            query_states = query_states.transpose(1, 2)
+            key_states = key_states.transpose(1, 2)
+            value_states = value_states.transpose(1, 2)
+
+        key_states, value_states = _maybe_repeat_kv_for_ulysses(key_states, value_states, ulysses_size=ulysses_size)
+
+        q_global = SeqAllToAll4D.apply(ulysses_group, query_states, 2, 1, False)
+        k_global = SeqAllToAll4D.apply(ulysses_group, key_states, 2, 1, False)
+        v_global = SeqAllToAll4D.apply(ulysses_group, value_states, 2, 1, False)
+
+        # Gather attention_mask / position_ids to global sequence if present.
+        # Use shard lengths from the local query sequence (before all2all).
+        shard_lens = torch.tensor([query_states.size(1)], device=query_states.device, dtype=torch.int64)
+        shard_lens_list = [torch.zeros_like(shard_lens) for _ in range(dist.get_world_size(ulysses_group))]
+        dist.all_gather(shard_lens_list, shard_lens, group=ulysses_group)
+        shard_lens_cat = torch.cat(shard_lens_list, dim=0)
+
+        attn_mask_global = attention_mask
+
+        # PumpkinComment: (important for CP > 1 without rmpad)
+        # Transformers drops `attention_mask` to None when there are no pad tokens (fully causal).
+        # Under Ulysses CP, if one rank is fully causal and another is not, only some ranks would
+        # enter the gather below and the collective would hang. So whenever any rank reports a
+        # mask, ranks without one build an all-ones (fully causal) dummy so everyone participates.
+        if attn_present:
+            if not torch.is_tensor(attention_mask):
+                # All-ones dummy local mask so all ranks participate in the same all_gather.
+                attention_mask = torch.ones(
+                    (query_states.size(0), query_states.size(1)),
+                    device=query_states.device,
+                    dtype=attn_dtype,
+                )
+            attn_mask_global = _gather_sharded_seq_tensor(
+                attention_mask, group=ulysses_group, shard_lens=shard_lens_cat
+            )
+
+        position_ids_global = position_ids
+        # PumpkinComment:
+        # Transformers sometimes sets `position_ids=None` when not needed, or passes it only in some codepaths.
+        # Under Ulysses CP, if one rank enters the gather path and another rank skips it, NCCL will hang.
+        if pos_present:
+            # Ensure all ranks participate in the gather:
+            # - If local `position_ids` is missing, create a dummy 1D tensor.
+            # - If local `position_ids` is provided, force it into the HF FlashAttention "1D PE" form.
+            local_len = int(query_states.size(1))
+            bs = int(query_states.size(0))
+            if not torch.is_tensor(position_ids):
+                # Create a dummy that matches the query batch size.
+ base = torch.arange(local_len, device=dev, dtype=pos_dtype) + position_ids = base.unsqueeze(0).expand(bs, -1).contiguous() + else: + if position_ids.dtype != pos_dtype: + position_ids = position_ids.to(dtype=pos_dtype) + if position_ids.dim() == 1: + position_ids = position_ids.unsqueeze(0).expand(bs, -1).contiguous() + elif position_ids.dim() == 2: + if int(position_ids.size(0)) == 1 and bs > 1: + position_ids = position_ids.expand(bs, -1).contiguous() + assert int(position_ids.size(0)) == bs, ( + "position_ids batch size must match query batch size under Ulysses CP. " + f"position_ids.shape={tuple(position_ids.shape)}, query_bs={bs}" + ) + else: + raise AssertionError( + "Ulysses CP FlashAttention wrapper only supports 1D or 2D `position_ids`. " + f"Got shape={tuple(position_ids.shape)}" + ) + + position_ids_global = _gather_sharded_seq_tensor( + position_ids, group=ulysses_group, shard_lens=shard_lens_cat + ) + position_ids_global = _normalize_position_ids_for_fa_varlen(position_ids_global) + + query_length_global = q_global.size(1) + + new_args = list(args) + if len(new_args) > 0: + new_args[0] = q_global + if len(new_args) > 1: + new_args[1] = k_global + if len(new_args) > 2: + new_args[2] = v_global + if len(new_args) > 3: + new_args[3] = attn_mask_global + if len(new_args) > 4: + new_args[4] = query_length_global + + # Only update kwargs keys that were already provided (do NOT inject new, version-dependent kw names). + if "query_states" in kwargs: + kwargs["query_states"] = q_global + if "key_states" in kwargs: + kwargs["key_states"] = k_global + if "value_states" in kwargs: + kwargs["value_states"] = v_global + if "attention_mask" in kwargs: + kwargs["attention_mask"] = attn_mask_global + if "position_ids" in kwargs: + kwargs["position_ids"] = position_ids_global + if "query_length" in kwargs: + kwargs["query_length"] = query_length_global + elif len(new_args) <= 4: + # If query_length isn't positional in this call, pass it iff the original accepts it. + sig = None + try: + sig = inspect.signature(original_forward) + except Exception: + sig = None + if sig is None or "query_length" in sig.parameters: + kwargs["query_length"] = query_length_global + + out = original_forward(*new_args, **kwargs) + + if isinstance(out, tuple): + attn_out = out[0] + else: + attn_out = out + + if torch.is_tensor(attn_out) and attn_out.dim() == 4: + local_out = SeqAllToAll4D.apply(ulysses_group, attn_out, 1, 2, False) + if layout == "bhsd": + local_out = local_out.transpose(1, 2) + if isinstance(out, tuple): + return (local_out,) + out[1:] + return local_out + + return out + + return _wrapped + + +def apply_hf_flash_attention_ulysses_patch() -> Dict[str, Any]: + """ + PumpkinComment: Patch for different versions of Transformers. 
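+
+    Both known locations of `_flash_attention_forward` (transformers.modeling_flash_attention_utils
+    and transformers.integrations.flash_attention) are wrapped when importable; the originals are
+    kept in `_PATCH_STATE` so `unapply_hf_flash_attention_ulysses_patch` can restore them.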
+ """ + if _PATCH_STATE["patched"]: + return {"patched": True, "already": True, **_PATCH_STATE} + + patched_any = False + result: Dict[str, Any] = {"patched": False, "targets": []} + + try: + import transformers.modeling_flash_attention_utils as mfu + + if hasattr(mfu, "_flash_attention_forward"): + _PATCH_STATE["orig_modeling_flash_attention_forward"] = mfu._flash_attention_forward + mfu._flash_attention_forward = make_ulysses_flash_attention_forward(mfu._flash_attention_forward) + patched_any = True + result["targets"].append("transformers.modeling_flash_attention_utils._flash_attention_forward") + except Exception as e: + logger.warning(f"Failed to patch transformers.modeling_flash_attention_utils._flash_attention_forward: {e}") + + try: + from transformers.integrations import flash_attention as fa + + if hasattr(fa, "_flash_attention_forward"): + _PATCH_STATE["orig_integrations_flash_attention_forward"] = fa._flash_attention_forward + fa._flash_attention_forward = make_ulysses_flash_attention_forward(fa._flash_attention_forward) + patched_any = True + result["targets"].append("transformers.integrations.flash_attention._flash_attention_forward") + except Exception as e: + logger.warning(f"Failed to patch transformers.integrations.flash_attention._flash_attention_forward: {e}") + + _PATCH_STATE["patched"] = patched_any + result["patched"] = patched_any + return result + + +def unapply_hf_flash_attention_ulysses_patch() -> None: + if not _PATCH_STATE["patched"]: + return + + try: + import transformers.modeling_flash_attention_utils as mfu + + if _PATCH_STATE["orig_modeling_flash_attention_forward"] is not None: + mfu._flash_attention_forward = _PATCH_STATE["orig_modeling_flash_attention_forward"] + except Exception: + pass + + try: + from transformers.integrations import flash_attention as fa + + if _PATCH_STATE["orig_integrations_flash_attention_forward"] is not None: + fa._flash_attention_forward = _PATCH_STATE["orig_integrations_flash_attention_forward"] + except Exception: + pass + + _PATCH_STATE["patched"] = False diff --git a/roll/utils/context_parallel/monkey_patch.py b/roll/utils/context_parallel/monkey_patch.py index b64b3c339..a98ec66d8 100644 --- a/roll/utils/context_parallel/monkey_patch.py +++ b/roll/utils/context_parallel/monkey_patch.py @@ -1,12 +1,9 @@ from transformers.modeling_utils import ALL_ATTENTION_FUNCTIONS from transformers.models.qwen2.modeling_qwen2 import Qwen2Model - - from roll.utils.logging import get_logger from roll.utils.packages import is_transformers_version_greater_than - logger = get_logger() @@ -25,8 +22,17 @@ def apply_ulysses_patch(): Qwen2Model._update_causal_mask = _update_causal_mask return _flash_attention_forward, _update_causal_mask else: - logger.warning("Currently, ulysses_attention patching is not supported for transformers>=4.53.0") - return None + from .hf_flash_attention_patch import apply_hf_flash_attention_ulysses_patch + + patch_info = apply_hf_flash_attention_ulysses_patch() + if not patch_info.get("patched", False): + logger.warning( + "Failed to apply ulysses_attention patching for transformers>=4.53.0 " + "(no FlashAttention2 hook patched)." 
+ ) + return None + logger.info(f"Applied ulysses_attention patching for transformers>=4.53.0: {patch_info.get('targets')}") + return patch_info def unapply_ulysses_patch(): @@ -34,3 +40,10 @@ def unapply_ulysses_patch(): ALL_ATTENTION_FUNCTIONS["flash_attention_2"] = old_flash_attention_forward if not is_transformers_version_greater_than("4.53.0"): Qwen2Model._update_causal_mask = old_update_causal_mask + else: + try: + from .hf_flash_attention_patch import unapply_hf_flash_attention_ulysses_patch + + unapply_hf_flash_attention_ulysses_patch() + except Exception: + pass diff --git a/roll/utils/context_parallel/rmpad_ulysses.py b/roll/utils/context_parallel/rmpad_ulysses.py new file mode 100644 index 000000000..b119f4285 --- /dev/null +++ b/roll/utils/context_parallel/rmpad_ulysses.py @@ -0,0 +1,136 @@ +""" +Reference: https://verl.readthedocs.io/en/latest/_modules/verl/utils/ulysses.html +""" + +from typing import Optional, Tuple + +import torch +import torch.distributed as dist + +from roll.utils.context_parallel.autograd_gather import ulysses_gather +from roll.utils.context_parallel.globals import get_ulysses_group + + +def ulysses_pad_inputs( + input_ids_rmpad: torch.Tensor, + position_ids_rmpad: Optional[torch.Tensor] = None, + *, + cp_size: int, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], int]: + """ + Pad rmpad token streams so sequence length is divisible by cp_size, without slicing. + + This is used by VLM CP(Ulysses) "slice-after-embedding" paths where we must keep the + full token stream on every CP rank until the decoder slices `inputs_embeds`. + + Args: + input_ids_rmpad: shape [1, total_nnz] + position_ids_rmpad: shape [1, total_nnz] or [C, 1, total_nnz] (e.g. mrope) + cp_size: context parallel group size + + Returns: + padded_input_ids_rmpad: shape [1, total_padded] + padded_position_ids_rmpad: same padding, if provided + pad_size: how many tokens were padded at the end + """ + if cp_size <= 1: + return input_ids_rmpad, position_ids_rmpad, 0 + + assert ( + input_ids_rmpad.dim() == 2 and input_ids_rmpad.size(0) == 1 + ), f"Expected input_ids_rmpad shape [1, total_nnz], got {tuple(input_ids_rmpad.shape)}" + if position_ids_rmpad is not None: + assert position_ids_rmpad.size(-2) == 1, "position_ids_rmpad must have batch dim==1 for rmpad path" + assert input_ids_rmpad.size(-1) == position_ids_rmpad.size(-1) + + _, total_seq_len = input_ids_rmpad.shape + pad_size = (cp_size - (total_seq_len % cp_size)) % cp_size + if pad_size > 0: + input_ids_rmpad = torch.nn.functional.pad(input_ids_rmpad, (0, pad_size), value=0) + if position_ids_rmpad is not None: + pad_pos = torch.arange(pad_size, device=position_ids_rmpad.device).unsqueeze(0) # [1, pad] + if position_ids_rmpad.dim() == 3: + pad_pos = pad_pos.unsqueeze(0).repeat(position_ids_rmpad.size(0), 1, 1) # [C, 1, pad] + position_ids_rmpad = torch.cat((position_ids_rmpad, pad_pos), dim=-1) + + return input_ids_rmpad, position_ids_rmpad, pad_size + + +def ulysses_pad_and_slice_inputs( + input_ids_rmpad: torch.Tensor, + position_ids_rmpad: Optional[torch.Tensor] = None, + *, + cp_size: int, + cp_rank: int, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], int]: + """ + Pad and slice rmpad token streams so sequence length is divisible by cp_size. + + Args: + input_ids_rmpad: shape [1, total_nnz] + position_ids_rmpad: shape [1, total_nnz] or [C, 1, total_nnz] (e.g. 
mrope) + cp_size/cp_rank: context parallel group size/rank + + Returns: + sliced_input_ids_rmpad: shape [1, total_padded/cp_size] + sliced_position_ids_rmpad: same slicing, if provided + pad_size: how many tokens were padded at the end + """ + if cp_size <= 1: + return input_ids_rmpad, position_ids_rmpad, 0 + + assert ( + input_ids_rmpad.dim() == 2 and input_ids_rmpad.size(0) == 1 + ), f"Expected input_ids_rmpad shape [1, total_nnz], got {tuple(input_ids_rmpad.shape)}" + if position_ids_rmpad is not None: + assert position_ids_rmpad.size(-2) == 1, "position_ids_rmpad must have batch dim==1 for rmpad path" + assert input_ids_rmpad.size(-1) == position_ids_rmpad.size(-1) + + _, total_seq_len = input_ids_rmpad.shape + pad_size = (cp_size - (total_seq_len % cp_size)) % cp_size + if pad_size > 0: + input_ids_rmpad = torch.nn.functional.pad(input_ids_rmpad, (0, pad_size), value=0) + if position_ids_rmpad is not None: + pad_pos = torch.arange(pad_size, device=position_ids_rmpad.device).unsqueeze(0) # [1, pad] + if position_ids_rmpad.dim() == 3: + pad_pos = pad_pos.unsqueeze(0).repeat(position_ids_rmpad.size(0), 1, 1) # [C, 1, pad] + position_ids_rmpad = torch.cat((position_ids_rmpad, pad_pos), dim=-1) + + total_padded = input_ids_rmpad.size(1) + part = total_padded // cp_size + start = cp_rank * part + end = (cp_rank + 1) * part + input_ids_rmpad = input_ids_rmpad[:, start:end] + if position_ids_rmpad is not None: + position_ids_rmpad = position_ids_rmpad[..., start:end] + return input_ids_rmpad, position_ids_rmpad, pad_size + + +def gather_outputs_and_unpad( + x: torch.Tensor, + *, + gather_dim: int, + unpad_dim: Optional[int] = None, + padding_size: int = 0, + group: Optional[dist.ProcessGroup] = None, +) -> torch.Tensor: + """ + All-gather tensors across CP ranks and optionally remove padding added by `ulysses_pad_and_slice_inputs`. + + Note: this gathers full tensors onto every CP rank; use only when acceptable. 
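+
+    Typically paired with `ulysses_pad_and_slice_inputs`: pass its returned `pad_size` as
+    `padding_size` (with `unpad_dim` set to the sequence dim) to strip the CP-alignment padding
+    after the gather.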
+ """ + group = get_ulysses_group() if group is None else group + if group is None or dist.get_world_size(group) <= 1: + if unpad_dim is not None and padding_size: + sl = [slice(None)] * x.dim() + sl[unpad_dim] = slice(0, x.size(unpad_dim) - padding_size) + return x[tuple(sl)] + return x + + out = ulysses_gather(x, gather_dim=gather_dim, group=group, grad_scaler=True) + + if unpad_dim is not None and padding_size: + sl = [slice(None)] * out.dim() + sl[unpad_dim] = slice(0, out.size(unpad_dim) - padding_size) + out = out[tuple(sl)] + return out diff --git a/roll/utils/context_parallel/ulysses_attention.py b/roll/utils/context_parallel/ulysses_attention.py index b46ee3e28..cb15dad2b 100644 --- a/roll/utils/context_parallel/ulysses_attention.py +++ b/roll/utils/context_parallel/ulysses_attention.py @@ -4,25 +4,19 @@ import copy import inspect import os -from typing import Any, Callable -from typing import Optional +from typing import Any, Callable, Optional import torch import torch.distributed as dist -from torch import Tensor - from flash_attn import flash_attn_func, flash_attn_varlen_func from flash_attn.bert_padding import pad_input - -from roll.utils.context_parallel.all_to_all import SeqAllToAll4D -from roll.utils.context_parallel.globals import get_ulysses_seqlen, get_ulysses_size - +from torch import Tensor from transformers.cache_utils import Cache from transformers.modeling_flash_attention_utils import _upad_input from transformers.utils import is_flash_attn_greater_or_equal -from roll.utils.context_parallel.globals import get_ulysses_group -from roll.utils.context_parallel.ulysses_attention import UlyssesAttention +from roll.utils.context_parallel.all_to_all import SeqAllToAll4D +from roll.utils.context_parallel.globals import get_ulysses_group, get_ulysses_seqlen, get_ulysses_size def _ulysses_attn_varlen_func( diff --git a/roll/utils/context_parallel/vlm_cp_patch.py b/roll/utils/context_parallel/vlm_cp_patch.py new file mode 100644 index 000000000..38348f2e8 --- /dev/null +++ b/roll/utils/context_parallel/vlm_cp_patch.py @@ -0,0 +1,147 @@ +""" +PumpkinComment: + +For many VLMs, slicing `input_ids` before the model builds `inputs_embeds` can break alignment between +visual placeholder tokens and visual features. Instead, keep the full token stream on every CP rank, +build `inputs_embeds`, then slice `inputs_embeds` (and associated tensors) inside the decoder forward. + +Reference: https://github.com/volcengine/verl/blob/main/verl/models/transformers/monkey_patch.py +""" + +import types +from typing import Any, Optional, Sequence, Tuple + +import torch +import torch.distributed as dist +from torch import nn + +from roll.utils.context_parallel.globals import get_ulysses_group, get_ulysses_size +from roll.utils.logging import get_logger + +logger = get_logger() + + +def _get_cp_info() -> Tuple[int, int, Optional[dist.ProcessGroup]]: + group = get_ulysses_group() + cp_size = int(get_ulysses_size() or 1) + if group is None or cp_size <= 1: + return 1, 0, group + return cp_size, dist.get_rank(group), group + + +def _slice_seq_dim(x: torch.Tensor, *, start: int, end: int, seq_dim: int) -> torch.Tensor: + sl = [slice(None)] * x.dim() + sl[seq_dim] = slice(start, end) + return x[tuple(sl)].contiguous() + + +def _slice_position_ids(position_ids: torch.Tensor, *, start: int, end: int) -> torch.Tensor: + # Common shapes: + # - (bs, seq) + # - (C, bs, seq) (e.g. 
some multimodal rope layouts) + # - (C, 1, seq) (rmpad path with bs==1) + if position_ids.dim() == 2: + return position_ids[:, start:end].contiguous() + if position_ids.dim() == 3: + return position_ids[..., start:end].contiguous() + raise ValueError(f"Unexpected position_ids shape: {position_ids.shape}") + + +def _slice_attention_mask(attention_mask: torch.Tensor, *, start: int, end: int) -> torch.Tensor: + if attention_mask.dim() == 2: + return attention_mask[:, start:end].contiguous() + if attention_mask.dim() == 4 and attention_mask.size(-1) >= end and attention_mask.size(-2) >= end: + return attention_mask[:, :, start:end, start:end].contiguous() + raise ValueError(f"Unexpected attention_mask shape: {attention_mask.shape}") + + +def patch_vlm_decoder_for_cp( + decoder_module: nn.Module, + *, + allow_no_inputs_embeds: bool = True, + name: str = "", +) -> bool: + """ + Patch a decoder/text-stack module to slice `inputs_embeds` inside forward under CP. + + This patches ONLY the given module instance (not the global class), to avoid affecting other code paths. + """ + if getattr(decoder_module, "_roll_vlm_cp_patched", False): + return True + + original_forward = decoder_module.forward + + def _wrapped_forward(self: nn.Module, *args: Any, **kwargs: Any): + cp_size, cp_rank, _ = _get_cp_info() + if cp_size <= 1: + return original_forward(*args, **kwargs) + + inputs_embeds = kwargs.get("inputs_embeds", None) + if not torch.is_tensor(inputs_embeds): + if allow_no_inputs_embeds: + return original_forward(*args, **kwargs) + raise RuntimeError("VLM CP patch expects `inputs_embeds` in decoder forward kwargs, but it was missing.") + + # Guard against re-entrancy / nested forwards. + if not getattr(self, "_roll_vlm_cp_needs_initial_slice", True): + return original_forward(*args, **kwargs) + + seq_len = inputs_embeds.size(1) + if seq_len % cp_size != 0: + # This should not happen if the caller padded to multiple-of-cp, but keep safe. + raise RuntimeError(f"inputs_embeds seq_len={seq_len} not divisible by cp_size={cp_size}") + part = seq_len // cp_size + start = cp_rank * part + end = (cp_rank + 1) * part + + call_kwargs = dict(kwargs) + call_kwargs["inputs_embeds"] = _slice_seq_dim(inputs_embeds, start=start, end=end, seq_dim=1) + + # Slice position_ids if present. + position_ids = call_kwargs.get("position_ids", None) + if torch.is_tensor(position_ids): + call_kwargs["position_ids"] = _slice_position_ids(position_ids, start=start, end=end) + + # Slice attention_mask if present (non-rmpad CP path). + attention_mask = call_kwargs.get("attention_mask", None) + if torch.is_tensor(attention_mask): + call_kwargs["attention_mask"] = _slice_attention_mask(attention_mask, start=start, end=end) + + # Qwen3-VL style extras (best-effort). + visual_pos_masks = call_kwargs.get("visual_pos_masks", None) + deepstack_visual_embeds = call_kwargs.get("deepstack_visual_embeds", None) + if torch.is_tensor(visual_pos_masks): + # visual_pos_masks expected shape: (bs, seq) + sliced_visual_mask = _slice_seq_dim(visual_pos_masks, start=start, end=end, seq_dim=1) + call_kwargs["visual_pos_masks"] = sliced_visual_mask + + if isinstance(deepstack_visual_embeds, Sequence) and len(deepstack_visual_embeds) > 0: + # Compute which visual embeddings belong to this CP shard. + # We count visual tokens across the whole (replicated) batch. 
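+            # Prefix sums over `visual_pos_masks` translate this shard's token range [start, end)
+            # into the [visual_start, visual_end) slice of each deepstack embedding tensor.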
+ with torch.no_grad(): + visual_start = int(visual_pos_masks[:, :start].sum().item()) if start > 0 else 0 + visual_end = int(visual_pos_masks[:, :end].sum().item()) + + sliced_embeds = [] + for emb in deepstack_visual_embeds: + if not torch.is_tensor(emb): + sliced_embeds.append(emb) + continue + if visual_end <= visual_start: + sliced_embeds.append(emb[:0]) + else: + sliced_embeds.append(emb[visual_start:visual_end]) + call_kwargs["deepstack_visual_embeds"] = sliced_embeds + + self._roll_vlm_cp_needs_initial_slice = False + try: + return original_forward(*args, **call_kwargs) + finally: + self._roll_vlm_cp_needs_initial_slice = True + + decoder_module.forward = types.MethodType(_wrapped_forward, decoder_module) + setattr(decoder_module, "_roll_vlm_cp_patched", True) + setattr(decoder_module, "_roll_vlm_cp_needs_initial_slice", True) + if dist.is_available() and dist.is_initialized() and dist.get_rank() == 0: + logger.info(f"Applied VLM CP decoder slice patch to {name or decoder_module.__class__.__name__}") + return True diff --git a/roll/utils/dynamic_batching.py b/roll/utils/dynamic_batching.py index 83af8120a..34b9c0f08 100644 --- a/roll/utils/dynamic_batching.py +++ b/roll/utils/dynamic_batching.py @@ -1,8 +1,13 @@ +import bisect from typing import Iterator import torch from roll.distributed.scheduler.protocol import DataProto +from roll.utils.logging import get_logger + + +logger = get_logger() def dynamic_batching_shard( @@ -10,11 +15,17 @@ def dynamic_batching_shard( dp_size: int, max_tokens_per_microbatch: int, sequence_length_round: int, + pipeline_model_parallel_size: int = 1, + virtual_pipeline_model_parallel_size: int = None, log_prefix: str = None, ) -> tuple[DataProto, dict]: + #TODO use Karmarkar–Karp algorithm to replace the greedy implementation attention_mask = origin_batch.batch["attention_mask"] batch_size = attention_mask.shape[0] seq_lens = attention_mask.view(batch_size, -1).sum(-1).tolist() + + if 0 in seq_lens: + logger.warning(f"The attention_mask is all zero in the {log_prefix} stage. 
Please verify the rollout stage.") seq_index_sort_by_len = [i for i, _ in sorted(enumerate(seq_lens), key=lambda x: x[1])] seq_lens_sort = [seq_lens[i] for i in seq_index_sort_by_len] @@ -27,10 +38,9 @@ def dynamic_batching_shard( global_micro_batch_indices = [[0, 0]] global_micro_batch_lengths = [0] - max_seqlen_this_mb = 0 + max_seqlen_this_mb = sequence_length_round # at least `sequence_length_round` shard_size = len(aggregated_shards[0]) - total_tokens = 0 for shard_indice in range(shard_size): max_seqlen_this_shard_indice = 0 for shard, seq_lens in zip(aggregated_shards, seq_len_of_shard): @@ -53,7 +63,58 @@ def dynamic_batching_shard( global_micro_batch_indices.append([shard_indice, shard_indice + 1]) max_seqlen_this_mb = max_seqlen_this_shard_indice global_micro_batch_lengths.append(max_seqlen_this_mb) - total_tokens += total_tokens_in_mbs + + total_tokens = sum( + (end - start) * length + for (start, end), length in zip(global_micro_batch_indices, global_micro_batch_lengths) + ) + if pipeline_model_parallel_size > 1 and virtual_pipeline_model_parallel_size: + # pad to multiple of `microbatch_group_size_per_vp_stage` + num_micro_batches = len(global_micro_batch_indices) + padded_num_micro_batches = ( + (num_micro_batches + pipeline_model_parallel_size - 1) // pipeline_model_parallel_size + ) * pipeline_model_parallel_size + assert pipeline_model_parallel_size <= shard_size, f"The pipeline_model_size: {pipeline_model_parallel_size} should not be greater than num_seqs in one dp_rank" + assert padded_num_micro_batches <= shard_size + num_micro_batches_needed = padded_num_micro_batches - num_micro_batches + + splittable_mbs = [i for i in range(num_micro_batches) if (global_micro_batch_indices[i][1] - global_micro_batch_indices[i][0]) > 1] + # sort by tokens + splittable_mbs.sort(key=lambda x: (global_micro_batch_indices[x][1] - global_micro_batch_indices[x][0]) * global_micro_batch_lengths[x], reverse=True) + + assert len(splittable_mbs) >= num_micro_batches_needed + dropped_mbs = [] + added_micro_batch_indices = [] + added_micro_batch_lengths = [] + while num_micro_batches_needed: + mb_to_split = splittable_mbs.pop(0) + + # compute split point + split_start, split_end = global_micro_batch_indices[mb_to_split] + split_length = global_micro_batch_lengths[mb_to_split] + split_seqs = split_end - split_start + split_point = split_start + (split_seqs // 2) + + # generate new mb + new_mb1 = [split_start, split_point] + new_mb2 = [split_point, split_end] + + # record dropped and added mbs + dropped_mbs.append(mb_to_split) + added_micro_batch_indices += [new_mb1, new_mb2] + added_micro_batch_lengths += [split_length, split_length] + + num_micro_batches_needed -= 1 + + global_micro_batch_indices = [global_micro_batch_indices[i] for i in range(num_micro_batches) if i not in dropped_mbs] + global_micro_batch_lengths = [global_micro_batch_lengths[i] for i in range(num_micro_batches) if i not in dropped_mbs] + + # insert added_mbs, ensure sorted + for added_mbs_indices, added_mbs_length in zip(added_micro_batch_indices, added_micro_batch_lengths): + insert_indice = bisect.bisect_right(global_micro_batch_indices, added_mbs_indices) + global_micro_batch_indices.insert(insert_indice, added_mbs_indices) + global_micro_batch_lengths.insert(insert_indice, added_mbs_length) + batch = DataProto.concat(aggregated_shards) batch.meta_info["global_micro_batch_indices"] = global_micro_batch_indices batch.meta_info["global_micro_batch_lengths"] = global_micro_batch_lengths @@ -113,6 +174,7 @@ def 
make_mini_batch_iter_for_dynamic_batching( data.meta_info["micro_batch_indices"] = [[x - start for x in row] for row in indices_chunk] data.meta_info["micro_batch_lengths"] = global_micro_batch_lengths[i : i + ga_steps] + mini_batch.meta_info["mini_batch_size"] = mini_batch.batch.batch_size[0] mini_batch.meta_info["num_micro_batchs"] = len(indices_chunk) yield (mini_batch) @@ -125,7 +187,7 @@ def make_micro_batch_iter_for_dynamic_batching(mini_batch: DataProto): micro_batch = mini_batch.slice(start_idx, end_idx) input_ids_shape = micro_batch.batch["input_ids"].shape for k in mini_batch.batch.keys(): - if len(micro_batch.batch[k].shape) == len(input_ids_shape) and micro_batch.batch[k].shape[-1] in ( + if (len(micro_batch.batch[k].shape) == len(input_ids_shape) or k == "position_ids") and micro_batch.batch[k].shape[-1] in ( input_ids_shape[-1], input_ids_shape[-1] - 1, ): @@ -135,4 +197,4 @@ def make_micro_batch_iter_for_dynamic_batching(mini_batch: DataProto): start=0, length=seqlen if micro_batch.batch[k].shape[-1] == input_ids_shape[-1] else seqlen - 1, ) - yield micro_batch + yield micro_batch \ No newline at end of file diff --git a/roll/utils/fp8.py b/roll/utils/fp8.py new file mode 100644 index 000000000..56e108adb --- /dev/null +++ b/roll/utils/fp8.py @@ -0,0 +1,61 @@ +from typing import List + +import torch + +# Block quant operator +# +# Borrow from transformers +# https://huggingface.co/docs/transformers/en/quantization/finegrained_fp8 +# https://github.com/huggingface/transformers/blob/v4.55.0/src/transformers/quantizers/quantizer_finegrained_fp8.py#L83 +# +# May use op from torchao: +# https://github.com/pytorch/ao/pull/1668 +# https://github.com/volcengine/verl/pull/3084 +def per_block_fp8_quant(param_value: torch.Tensor, weight_block_size: List[int]): + """ + Quantizes weights to FP8 format using Block-wise quantization + """ + # Get FP8 min/max values + fp8_min = torch.finfo(torch.float8_e4m3fn).min + fp8_max = torch.finfo(torch.float8_e4m3fn).max + + block_size_m, block_size_n = weight_block_size + + rows, cols = param_value.shape[-2:] + + if rows % block_size_m != 0 or cols % block_size_n != 0: + raise ValueError( + f"Matrix dimensions ({rows}, {cols}) must be divisible by block sizes ({block_size_m}, {block_size_n})" + ) + param_value_orig_shape = param_value.shape + + param_value = param_value.reshape( + -1, rows // block_size_m, block_size_m, cols // block_size_n, block_size_n + ).permute(0, 1, 3, 2, 4) + + # Calculate scaling factor for each block + max_abs = torch.amax(torch.abs(param_value), dim=(-1, -2)) + scale = fp8_max / max_abs + scale_orig_shape = scale.shape + scale = scale.unsqueeze(-1).unsqueeze(-1) + + # Quantize the weights + quantized_param = torch.clamp(param_value * scale, min=fp8_min, max=fp8_max).to(torch.float8_e4m3fn) + + quantized_param = quantized_param.permute(0, 1, 3, 2, 4) + # Reshape back to matrix shape + quantized_param = quantized_param.reshape(param_value_orig_shape) + + # Construct the final, correct shape for the scales + num_row_blocks = rows // block_size_m + num_col_blocks = cols // block_size_n + # This preserves original batch dimensions, if any + final_scale_shape = (*param_value_orig_shape[:-2], num_row_blocks, num_col_blocks) + # Reshape directly to the correct shape and take the reciprocal + scale = scale.reshape(final_scale_shape).reciprocal() + + # TODO: DeepGemm scales need to be transposed and aligned (said in vLLM fp8.py)? 
+ + # TODO: On B200, DeepGemm only support E8M0 scale + + return quantized_param, scale diff --git a/roll/utils/fsdp_utils.py b/roll/utils/fsdp_utils.py new file mode 100644 index 000000000..a32719a3f --- /dev/null +++ b/roll/utils/fsdp_utils.py @@ -0,0 +1,290 @@ +import copy +import dataclasses +from abc import ABC +from contextlib import contextmanager + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed.fsdp import CPUOffloadPolicy, fully_shard +from torch.distributed.tensor import Shard + +from roll.models.model_providers import _is_moe_config +from roll.platforms import current_platform + +try: + from torch.distributed.device_mesh import DeviceMesh +except ImportError: + DeviceMesh = None + +fully_shard_module = torch.distributed.fsdp._fully_shard._fully_shard + + +@contextmanager +def maybe_patch_fsdp_module(model): + if fully_shard_module is None: + yield + return + + orig_fsdp_module = fully_shard_module.FSDPModule + + class FSDPModuleABC(ABC, orig_fsdp_module): + pass + + try: + if isinstance(model, ABC): + fully_shard_module.FSDPModule = FSDPModuleABC + yield + finally: + fully_shard_module.FSDPModule = orig_fsdp_module + + +def get_init_weight_context_manager(use_meta_tensor=True, mesh: DeviceMesh = None): + from accelerate import init_empty_weights + + cpu_init_weights = lambda: torch.device("cpu") + if use_meta_tensor: + if mesh is None: + init_context = init_empty_weights if torch.distributed.get_rank() != 0 else cpu_init_weights + else: + init_context = init_empty_weights if mesh.get_coordinate()[-1] != 0 else cpu_init_weights + else: + init_context = cpu_init_weights + return init_context + + +def get_shard_placement_fn(fsdp_size): + """ + Choose the dimension that can divide fsdp_size to avoid padding + Reference: https://github.com/volcengine/verl/blob/main/verl/utils/fsdp_utils.py + + """ + + def shard_placement_fn(param): + shape = list(param.shape) + for i in range(len(shape)): + if shape[i] % fsdp_size == 0: + return Shard(i) + return Shard(0) + + return shard_placement_fn + + +def _clone_mp_policy(mp_policy, **overrides): + if mp_policy is None: + return None + + if dataclasses.is_dataclass(mp_policy): + return dataclasses.replace(mp_policy, **overrides) + + # Try reconstructing via constructor from common attributes. + attrs = {} + for k in ("param_dtype", "reduce_dtype", "output_dtype", "cast_forward_inputs"): + if hasattr(mp_policy, k): + attrs[k] = getattr(mp_policy, k) + attrs.update(overrides) + return mp_policy.__class__(**attrs) + + +def _fsdp_kwargs_for_module(fsdp_kwargs: dict, module: nn.Module) -> dict: + """ + Allows overriding FSDP2 kwargs per module, e.g. disabling mp_policy.cast_forward_inputs + for specific classes like VL blocks. 
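+    A module can override this by exposing a `_fsdp2_cast_forward_inputs` attribute; when the
+    attribute is absent, `cast_forward_inputs` is forced to False for the wrapped submodule.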
+ """ + mp_policy = fsdp_kwargs.get("mp_policy", None) + if mp_policy is None or not hasattr(mp_policy, "cast_forward_inputs"): + return fsdp_kwargs + + attr_override = getattr(module, "_fsdp2_cast_forward_inputs", None) + if attr_override is not None: + desired = bool(attr_override) + else: + desired = False + + if desired == mp_policy.cast_forward_inputs: + return fsdp_kwargs + + new_kwargs = dict(fsdp_kwargs) + new_kwargs["mp_policy"] = _clone_mp_policy(mp_policy, cast_forward_inputs=desired) + return new_kwargs + + +def apply_fsdp2(model, fsdp_kwargs, config, is_lora=False): + """ + model: AutoModelForCausalLM + + Reference: https://github.com/volcengine/verl/blob/main/verl/utils/fsdp_utils.py + and LoRA Patch: https://github.com/volcengine/verl/issues/3470 + + """ + assert CPUOffloadPolicy is not None, "PyTorch version >= 2.4 is required for using fully_shard API (FSDP2)" + + model_cfg = getattr(model, "config", None) + is_moe = _is_moe_config(model_cfg) + apply_expert_patch = bool(config.get("apply_expert_patch", False)) + + if is_moe and apply_expert_patch: + from transformers.models.qwen3_moe.modeling_qwen3_moe import Qwen3MoeSparseMoeBlock + + from roll.third_party.fsdp2.qwen3_moe_patch import qwen3_moe_forward + + Qwen3MoeSparseMoeBlock.forward = qwen3_moe_forward + print("[apply_fsdp2] Applied expert patch for Qwen3MoeSparseMoeBlock") + + default_transformer_cls_names_to_wrap = getattr(model, "_no_split_modules", None) + fsdp_transformer_layer_cls_to_wrap = config.get("wrap_policy", {}).get( + "transformer_layer_cls_to_wrap", + default_transformer_cls_names_to_wrap, + ) + + if isinstance(fsdp_transformer_layer_cls_to_wrap, str): + fsdp_transformer_layer_cls_to_wrap = [fsdp_transformer_layer_cls_to_wrap] + + assert len(fsdp_transformer_layer_cls_to_wrap) > 0 and fsdp_transformer_layer_cls_to_wrap[0] is not None + + wrap_embeddings = bool(config.get("wrap_policy", {}).get("wrap_embeddings", False)) + wrap_lm_output = bool(config.get("wrap_policy", {}).get("wrap_lm_output", False)) + + def _get_embed_tokens(m: nn.Module): + inner = getattr(m, "model", None) + if inner is not None and hasattr(inner, "embed_tokens"): + return getattr(inner, "embed_tokens") + if hasattr(m, "embed_tokens"): + return getattr(m, "embed_tokens") + if hasattr(m, "get_input_embeddings"): + return m.get_input_embeddings() + return None + + def _already_fully_sharded(mod: nn.Module) -> bool: + # `fully_shard()` mutates the module into an internal FSDPModule type. If so, do not re-apply. + return fully_shard_module is not None and isinstance(mod, fully_shard_module.FSDPModule) + + lora_modules = [] + selected = [] + moe_modules = [] + for name, module in model.named_modules(): + if is_lora and ( + len(list(module.named_children())) == 0 + and getattr(module, "weight", None) is not None + and module.weight.requires_grad + ): + lora_modules.append(module) + + # PumpkinComment: + # (MoE): Do NOT FSDP-wrap individual experts by default. + # Experts are invoked conditionally per-rank (based on routing), + # so wrapping `experts.*` as separate FSDP modules can deadlock collectives when + # different ranks activate different experts. Therefor we only wrap experts + # if we apply the expert patch. 
+ if is_moe and config.get("apply_expert_patch", False): + moe_block = config.get("wrap_policy", {}).get("moe_experts", None) + if isinstance(moe_block, str): + moe_block = [moe_block] + if moe_block is not None and module.__class__.__name__ in moe_block: + moe_modules.append(module) + print("[apply_fsdp2] Wrapped MoE expert module: ", name, module.__class__.__name__) + + # If `wrap_embeddings` is enabled, embeddings are handled explicitly below to avoid double wrapping. + if module.__class__.__name__ in fsdp_transformer_layer_cls_to_wrap or ( + (not wrap_embeddings) + and isinstance(module, nn.Embedding) + and (not getattr(getattr(model, "config", None), "tie_word_embeddings", True)) + ): + selected.append((name, module)) + + # PumpkinComment: + # Avoid wrapping both a parent module and its child module with the same mesh. + selected_names = [n for n, _ in selected] + non_leaf = set() + for n in selected_names: + if not n: + continue + parts = n.split(".") + for i in range(1, len(parts)): + non_leaf.add(".".join(parts[:i])) + + modules = [m for n, m in selected if n not in non_leaf] + + wrapped_ids = set() + + def _wrap_once(mod: nn.Module, kwargs: dict): + if mod is None: + return + if id(mod) in wrapped_ids: + return + if _already_fully_sharded(mod): + wrapped_ids.add(id(mod)) + return + with maybe_patch_fsdp_module(mod): + fully_shard(mod, **kwargs) + wrapped_ids.add(id(mod)) + + # 1. Embeddings + if wrap_embeddings: + _wrap_once(_get_embed_tokens(model), fsdp_kwargs) + + # 2. LoRA Modules (Linear Layer) + for idx, module in enumerate(lora_modules): + _wrap_once(module, fsdp_kwargs) + + # 3. MoE + for idx, module in enumerate(moe_modules): + _wrap_once(module, fsdp_kwargs) + + # 4. Transformers Layers + for idx, module in enumerate(modules): + _wrap_once(module, _fsdp_kwargs_for_module(fsdp_kwargs, module)) + + # 5. LM Output + if wrap_lm_output: + _wrap_once(getattr(model, "lm_head", None), fsdp_kwargs) + + # Root wrap last for remaining modules. (FSDP2 will not reshard_after_forward for the root module.) + root_kwargs = dict(fsdp_kwargs) + root_kwargs["mp_policy"] = _clone_mp_policy(root_kwargs.get("mp_policy", None), cast_forward_inputs=False) + _wrap_once(model, root_kwargs) + + +def fsdp2_load_full_state_dict( + model: torch.nn.Module, + full_state: dict, + device_mesh=None, + cpu_offload=None, +): + """ + Reference: https://github1s.com/volcengine/verl/blob/main/verl/utils/fsdp_utils.py + + Loads the full state dict (could be only on rank 0) into the sharded model. This is done by broadcasting the + parameters from rank 0 to all other ranks. This function modifies the model in-place. 
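+    `full_state` only needs to be populated on rank 0: `set_model_state_dict` is called with
+    `broadcast_from_rank0=True`, and buffers (e.g. rotary embeddings) are broadcast separately
+    because they are not captured by the state dict.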
+ + Args: + model (`torch.nn.Module`): The model to load the state dict into + full_state (`dict`): The full state dict to load, can only be on rank 0 + """ + + from torch.distributed.checkpoint.state_dict import StateDictOptions, set_model_state_dict + + device_id = current_platform.current_device() + + if dist.get_rank() == 0: + model = model.to(device=device_id, non_blocking=True) + else: + model = model.to_empty(device=device_id) + + cpu_offload = cpu_offload is not None + options = StateDictOptions( + full_state_dict=True, + cpu_offload=cpu_offload, + broadcast_from_rank0=True, + ) + set_model_state_dict(model, full_state, options=options) + + # rotary_emb is not in state_dict, so we need to broadcast it manually + for name, buf in model.named_buffers(): + dist.broadcast(buf, src=0) + + if cpu_offload: + # Ensure model is on CPU but buffers are on GPU for FSDP2 CPU offload + model.to("cpu", non_blocking=True) + for buf in model.buffers(): + buf.data = buf.data.to(device_id) diff --git a/roll/utils/functionals.py b/roll/utils/functionals.py index 3c1661d0c..0cee47c75 100644 --- a/roll/utils/functionals.py +++ b/roll/utils/functionals.py @@ -1,7 +1,13 @@ +from __future__ import annotations + import inspect +from typing import TYPE_CHECKING +if TYPE_CHECKING: + from roll.distributed.scheduler.protocol import DataProto import enum import traceback +import heapq from typing import Dict, List, Optional, Tuple, Union import numpy as np @@ -9,13 +15,12 @@ import torch.nn.functional as F from tensordict import TensorDict -from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.configs.base_config import PPOConfig +from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.platforms import current_platform from roll.utils.kl_controller import AdaptiveKLController from roll.utils.logging import get_logger - logger = get_logger() @@ -209,8 +214,8 @@ def entropy_from_logits(logits: torch.Tensor): return entropy -def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str, - weights: Optional[torch.Tensor] = None, loss_scale: Optional[float] = None): +def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str, batch_num_tokens: int = None, + global_valid_samples: int = None, weights: Optional[torch.Tensor] = None): """ ref: https://github.com/volcengine/verl/blob/78532923368aeb058f62201489546d013df47710/verl/trainer/ppo/core_algos.py#L370 Aggregate the loss matrix into a scalar. 
@@ -225,27 +230,30 @@ def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str "seq-mean-token-sum-norm" / "seq-mean-token-sum" is the default behavior weights: `torch.Tensor` - loss_scale: `(float)` Returns: loss: `a scalar torch.Tensor` aggregated loss """ + if batch_num_tokens is None: + batch_num_tokens = loss_mask.sum() + if global_valid_samples is None: + global_valid_samples = loss_mat.size(0) if loss_agg_mode == "token-mean": if weights is None: weights = torch.ones(loss_mask.shape[0], device=loss_mask.device) - loss = masked_mean(loss_mat * weights.unsqueeze(-1), loss_mask) + loss = (loss_mat * weights.unsqueeze(-1)).sum() / batch_num_tokens elif loss_agg_mode == "seq-mean-token-sum": - seq_losses = masked_sum(loss_mat, loss_mask, dim=-1) # token-sum + seq_losses = masked_sum(loss_mat, loss_mask, dim=-1) # token-sum valid_samples = torch.any(loss_mask > 0, dim=-1).float() if weights is None: weights = torch.ones(loss_mask.shape[0], device=loss_mask.device) - loss = (seq_losses * weights * valid_samples).sum() / (valid_samples.sum() + 1e-8) # seq-mean + loss = (seq_losses * weights * valid_samples).sum() / (global_valid_samples + 1e-8) # seq-mean elif loss_agg_mode == "seq-mean-token-mean": seq_losses = masked_mean(loss_mat, loss_mask, dim=-1) valid_samples = torch.any(loss_mask > 0, dim=-1).float() if weights is None: weights = torch.ones(loss_mask.shape[0], device=loss_mask.device) - loss = (seq_losses * weights * valid_samples).sum() / (valid_samples.sum() + 1e-8) # seq-mean + loss = (seq_losses * weights * valid_samples).sum() / (global_valid_samples + 1e-8) # seq-mean elif loss_agg_mode == "seq-mean-token-sum-norm": seq_losses = masked_sum(loss_mat, loss_mask, dim=-1) valid_samples = torch.any(loss_mask > 0, dim=-1).float() @@ -259,7 +267,7 @@ def agg_loss(loss_mat: torch.Tensor, loss_mask: torch.Tensor, loss_agg_mode: str else: raise ValueError(f"Invalid loss_agg_mode: {loss_agg_mode}") - return loss * loss_scale if loss_scale else loss + return loss def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = None) -> torch.Tensor: @@ -312,21 +320,21 @@ def get_pad_mask(response_id: torch.Tensor, pad_token: int = 0, eos_token: int = e.g. pad token=0 response_id: [1, 2, 2, 42, 3, 5, 1, 0, 0] pad_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0] - + If eos_token == pad_token, the first pad token (which is the eos token) should be kept. e.g. pad_token=0, eos_token=0 response_id: [1, 2, 2, 42, 3, 5, 0, 0, 0] pad_mask: [1, 1, 1, 1, 1, 1, 1, 0, 0] (first pad token/eos token is kept) """ pad_mask = response_id.not_equal(pad_token).to(dtype) - + # eos_token == pad_token, 需要保留第一个pad token否则会误将eos token mask掉 if eos_token == pad_token: pad_positions = response_id.eq(pad_token).to(dtype) cumsum_pad = torch.cumsum(pad_positions, dim=-1) first_pad_token = (cumsum_pad == 1).to(dtype) pad_mask = pad_mask | first_pad_token - + assert ( not (pad_mask[:, 0] == 0).logical_and(pad_mask.sum(-1) != 0).any() ), f"response_id is not valid: {response_id}, pad_token is {pad_token}" @@ -364,56 +372,74 @@ def response_level_masked_whiten(values: torch.Tensor, mask: torch.Tensor, shift def reduce_metrics(metrics: dict, reduce_func=np.mean) -> dict: """ - Reduce metrics with enhanced aggregation support based on metric name suffixes. 
- - Supported suffixes: - - _mean: arithmetic mean (default) - - _max: maximum value - - _min: minimum value - - _p50: 50th percentile (median) - - _p99: 99th percentile - - _std: standard deviation - - _sum: sum of all values - - Args: - metrics: Dictionary of metric names to lists/tensors of values - reduce_func: Default reduction function (used for metrics without suffix) - - Returns: - Dictionary with reduced metric values + Reduce metrics by parsing an aggregation instruction from the metric name. + + Aggregation can be specified in the metric name using either of the following formats: + - Suffix after '@': e.g., "loss@sum", "latency@p99" + - Underscore suffix: e.g., "loss_sum", "latency_p99" + + Supported aggregation tags/suffixes: mean, max, min, p50, p99, std, sum + + Notes: + - The original metric key is preserved (the '@tag' or '_suffix' remains in the key). + - Scalar values (int, float, np.number) and torch.Tensor objects are left unchanged. + - Values of type list, tuple, or np.ndarray are reduced using the inferred aggregation function. + - If no aggregation tag or suffix is found, the default `reduce_func` is used. + - Empty sequences are skipped and not modified. """ import numpy as np - - def _parse_suffix(metric_name): - """Parse aggregation method from metric name suffix.""" - if metric_name.endswith('_mean'): - return np.mean - elif metric_name.endswith('_max'): - return np.max - elif metric_name.endswith('_min'): - return np.min - elif metric_name.endswith('_p50'): - return lambda x: np.percentile(x, 50) - elif metric_name.endswith('_p99'): - return lambda x: np.percentile(x, 99) - elif metric_name.endswith('_std'): - return np.std - elif metric_name.endswith('_sum'): - return np.sum - else: - return reduce_func - - for key, val in metrics.items(): - if isinstance(val, (list, tuple, np.ndarray)) and len(val) > 0: - # Use suffix-based aggregation if available - aggregation_func = _parse_suffix(key) - metrics[key] = float(aggregation_func(val)) + + reducers = { + "mean": np.mean, + "max": np.max, + "min": np.min, + "p50": lambda x: np.percentile(x, 50), + "p99": lambda x: np.percentile(x, 99), + "std": np.std, + "sum": np.sum, + } + + def _parse_aggregation_func(metric_name: str): + # First, check for '@' separator + if "@" in metric_name: + _, tag = metric_name.rsplit("@", 1) + tag = tag.strip() + if tag in reducers: + return reducers[tag] + else: + raise ValueError(f"Unknown reducer tag '{tag}' in metric '{metric_name}'") + + # Otherwise, check for underscore-based suffixes + for suffix_key in ["mean", "max", "min", "p50", "p99", "std", "sum"]: + if metric_name.endswith(f"_{suffix_key}"): + return reducers[suffix_key] + + # No aggregation specifier found → use default + return reduce_func + + for key, val in list(metrics.items()): + # Skip reduction for scalars and tensors + if isinstance(val, (int, float, np.number)) or isinstance(val, torch.Tensor): + continue + + # Reduce sequences + if isinstance(val, (list, tuple, np.ndarray)): + if len(val) == 0: + continue + agg_func = _parse_aggregation_func(key) + metrics[key] = float(agg_func(val)) else: - # Fallback to default reduction function - metrics[key] = reduce_func(val) - + # Fallback for other types (e.g., single-element containers) + metrics[key] = float(reduce_func(val)) + return metrics +def reduce_metrics_list(metrics_list: list, reduce_func=np.mean) -> dict: + if len(metrics_list) == 0: + return {} + merged_metrics = {k: reduce_func([m[k] for m in metrics_list]) for k in metrics_list[0].keys()} + return 
merged_metrics + def pad_to_length(tensor: torch.Tensor, length, pad_value, dim=-1): if tensor.size(dim) >= length: @@ -516,7 +542,9 @@ def expand_to_token_level(data: "DataProto"): return token_level_rewards -def reward_norm(response_level_rewards: torch.Tensor, n_sample=-1, running_ctrl={}, norm_mean_type=None, norm_std_type=None): +def reward_norm( + response_level_rewards: torch.Tensor, n_sample=-1, running_ctrl={}, norm_mean_type=None, norm_std_type=None +): group_mode = (norm_mean_type == "group") or (norm_std_type == "group") if group_mode and n_sample > 0: reshape_reward = response_level_rewards.reshape(*response_level_rewards.size()[:-1], -1, n_sample) @@ -543,10 +571,10 @@ def reward_norm(response_level_rewards: torch.Tensor, n_sample=-1, running_ctrl= rewards = reshape_reward if norm_mean_type == "group" else response_level_rewards # 标准化奖励 if norm_std_type is not None: - normalized_rewards = (rewards - reward_mean) / (reward_std + 1e-6) - else: + normalized_rewards = (rewards - reward_mean) / (reward_std + 1e-6) + else: normalized_rewards = (rewards - reward_mean) - + # 如果是对 group mean 归一化,需要恢复原始形状 if norm_mean_type == "group": normalized_rewards = normalized_rewards.reshape(*response_level_rewards.size()) @@ -609,7 +637,7 @@ def reward_postprocess(data: "DataProto", pipeline_config: RLVRConfig, running_c pipeline_config.norm_mean_type, pipeline_config.norm_std_type = "group", "group" response_level_rewards = reward_norm( - response_level_rewards, + response_level_rewards, n_sample=pipeline_config.actor_infer.generating_args.num_return_sequences, running_ctrl=running_ctrl, norm_mean_type=pipeline_config.norm_mean_type, @@ -777,14 +805,6 @@ def compute_advantage( data.batch["returns"] = returns return data - -class GenerateRequestType(enum.Enum): - ADD = enum.auto() - ABORT = enum.auto() - STOP = enum.auto() - ALIVE_CHECK = enum.auto() - - def postprocess_generate( prompts: "DataProto", output: torch.Tensor, @@ -793,7 +813,7 @@ def postprocess_generate( eos_token_id, pad_token_id, fill_eos_token=False, - output_logprobs: Optional[list[list[float]]]=None, + output_logprobs: Optional[list[list[float]]] = None, pad_to_seq_len=True, ) -> "DataProto": from roll.distributed.scheduler.protocol import DataProto @@ -811,7 +831,6 @@ def postprocess_generate( # input_batch_size * num_return_sequences output_batch_size = output.size(0) - input_batch_size = input_ids.size(0) prompt_length = input_ids.size(1) if pad_to_seq_len: @@ -825,7 +844,9 @@ def postprocess_generate( attention_mask = ( attention_mask.unsqueeze(1).repeat(1, num_return_sequences, 1).view(output_batch_size, prompt_length) ) - response_mask = get_pad_mask(response_id=response, pad_token=pad_token_id, eos_token=eos_token_id, dtype=attention_mask.dtype) + response_mask = get_pad_mask( + response_id=response, pad_token=pad_token_id, eos_token=eos_token_id, dtype=attention_mask.dtype + ) attention_mask = torch.cat((attention_mask, response_mask), dim=-1) position_ids = prompts.batch["position_ids"] @@ -837,7 +858,8 @@ def postprocess_generate( .view(output_batch_size, *position_ids.shape[-2:]) ) delta_position_id = torch.arange(1, (sequence_length - prompt_length) + 1, device=position_ids.device) - delta_position_id = delta_position_id.view(1, 1, -1).expand(output_batch_size, 3, -1) + # position_ids: (bsz, C, prompt_len). Expand delta along channel dim (C can be 3 or 4). 
+ delta_position_id = delta_position_id.view(1, 1, -1).expand(output_batch_size, position_ids.size(1), -1) response_position_ids = position_ids[..., -1:] + delta_position_id # left padding for prompt and right padding for response, to be converted # to right padding which is consistent with output @@ -846,7 +868,11 @@ def postprocess_generate( assert attention_mask.any(dim=1).all(), f"has all 0 attention_mask, {attention_mask} {input_ids}" first_one = attention_mask.float().argmax(dim=1) new_response_mask = torch.zeros_like(attention_mask) # response mask for cat input_ids - logprobs = torch.zeros([output_batch_size, sequence_length - 1], dtype=torch.float32) if output_logprobs is not None else None + logprobs = ( + torch.zeros([output_batch_size, sequence_length - 1], dtype=torch.float32) + if output_logprobs is not None + else None + ) for i in range(output_batch_size): shift = first_one[i].item() if shift > 0: @@ -858,7 +884,7 @@ def postprocess_generate( attention_mask[i][:valid_length] = 1 attention_mask[i][valid_length:] = 0 prompt_len = valid_length - response_length - new_response_mask[i][prompt_len : valid_length] = 1 + new_response_mask[i][prompt_len:valid_length] = 1 if logprobs is not None: logprobs[i][prompt_len - 1 : valid_length - 1] = torch.tensor( output_logprobs[i][:response_length], dtype=logprobs.dtype @@ -873,8 +899,8 @@ def postprocess_generate( # cause error: Image features and image tokens do not match output_position_ids[i, ..., :-shift] = output_position_ids[i, ..., shift:].clone() # only clean in VLM(qwen2-vl) to make no effect on LLM - if prompt_length > response_length: - output[i, -shift:] = pad_token_id + if shift > 0 and prompt_length > valid_length: + output[i, -shift:] = pad_token_id prompt_mask = (attention_mask == 1) & (new_response_mask == 0) if position_ids.dim() == 3: @@ -984,3 +1010,280 @@ def group_reward_norm(data: "DataProto", n_sample=-1, div_std=True, div_std_glob reshape_reward = reshape_reward / (torch.std(reshape_reward) + 1e-6) data.batch["response_level_rewards"] = reshape_reward.reshape(*response_level_rewards.size()) return data + + +def adjust_sequence_length(sequence, target_length, origin_seq_len, pad_value=0): + """ + 调整序列长度。自动探测序列维度(优先最后一维,其次向前搜索)。 + + Args: + sequence: 输入张量 (e.g., [B, S], [B, S, D], [B, 3, S]) + target_length: 目标的全局序列长度 + origin_seq_len: 当前张量应当对应的参考原始长度 + pad_value: 填充值 + """ + if sequence.dim() < 2: + return sequence + + # --- 1. 探测序列维度 (seq_dim) --- + seq_dim = None + is_causal_shift = False + + # 优先级:最后一维 (-1),然后是倒数第二维 (-2),以此类推 + # 检查是否等于参考长度 或 参考长度-1 (causal shift) + candidate_dims = [-1] + list(range(-2, -sequence.dim() - 1, -1)) + + for d in candidate_dims: + curr_size = sequence.size(d) + if curr_size == origin_seq_len: + seq_dim = d + is_causal_shift = False + break + elif curr_size == origin_seq_len - 1: + seq_dim = d + is_causal_shift = True + break + + # 如果没找到任何维度匹配 origin_seq_len,说明该张量不需要处理 + if seq_dim is None: + return sequence + + # --- 2. 计算实际需要调整到的目标长度 --- + actual_len = sequence.size(seq_dim) + # 如果原始是 S-1,目标也应该是 target-1 (保持位移一致) + effective_target = target_length - 1 if is_causal_shift else target_length + + if actual_len == effective_target: + return sequence + + # --- 3. 执行 Padding 或 Truncation --- + if actual_len < effective_target: + # Padding 逻辑 + pad_size = effective_target - actual_len + + # torch.nn.functional.pad 的 pad 参数顺序是: + # [最后维左, 最后维右, 倒数第二维左, 倒数第二维右, ...] 
+ # 我们只在识别到的 seq_dim 的右侧进行 padding + # 偏移量计算:abs(seq_dim) - 1 决定了前面有多少对 [0, 0] + pad_config = [0, 0] * (abs(seq_dim) - 1) + [0, pad_size] + + return torch.nn.functional.pad(sequence, pad_config, value=pad_value) + + else: + # Truncation 逻辑 (通用切片) + slices = [slice(None)] * sequence.dim() + slices[seq_dim] = slice(0, effective_target) + return sequence[tuple(slices)] + + +def get_seqlen_balanced_partitions(seqlen_list: List[float], + k_partitions: int, + equal_size: bool = False) -> List[List[int]]: + """ + Reference: https://github.com/volcengine/verl/blob/468adf22c43b744348051fccd7a5d830c6c3c36a/verl/utils/seqlen_balancing.py + + Partition sequences to balance workload using Karmarkar-Karp algorithm. + + Args: + seqlen_list: List of sequence lengths (or workloads) + k_partitions: Number of partitions to create + equal_size: If True, ensure all partitions have equal number of items + + Returns: + List of partitions, where each partition is a list of indices + """ + + class Set: + """Represents a set of items with their sum.""" + + def __init__(self): + self.sum = 0 + self.items = [] + + def add(self, idx: int, val: float): + self.items.append((idx, val)) + self.sum += val + + def merge(self, other): + for idx, val in other.items: + self.items.append((idx, val)) + self.sum += val + + def __lt__(self, other): + if self.sum != other.sum: + return self.sum < other.sum + if len(self.items) != len(other.items): + return len(self.items) < len(other.items) + return self.items < other.items + + class State: + """Represents a state in the partitioning algorithm.""" + + def __init__(self, items: List[Tuple[int, float]], k: int): + self.k = k + self.sets = [Set() for _ in range(k)] + assert len(items) in [1, k], f"{len(items)} not in [1, {k}]" + for i, (idx, seqlen) in enumerate(items): + self.sets[i].add(idx=idx, val=seqlen) + self.sets = sorted(self.sets, reverse=True) + + def get_partitions(self) -> List[List[int]]: + partitions = [] + for i in range(len(self.sets)): + cur_partition = [] + for idx, _ in self.sets[i].items: + cur_partition.append(idx) + partitions.append(cur_partition) + return partitions + + def merge(self, other): + for i in range(self.k): + self.sets[i].merge(other.sets[self.k - 1 - i]) + self.sets = sorted(self.sets, reverse=True) + + @property + def spread(self) -> float: + return self.sets[0].sum - self.sets[-1].sum + + def __lt__(self, other): + if self.spread != other.spread: + return self.spread > other.spread + return self.sets[0] > other.sets[0] + + assert len(seqlen_list) >= k_partitions, \ + f"number of items:[{len(seqlen_list)}] < k_partitions:[{k_partitions}]" + + # Sort by sequence length + sorted_seqlen_list = sorted([(seqlen, i) for i, seqlen in enumerate(seqlen_list)]) + states_pq = [] + + if equal_size: + assert len(seqlen_list) % k_partitions == 0, \ + f"{len(seqlen_list)} % {k_partitions} != 0" + for offset in range(0, len(sorted_seqlen_list), k_partitions): + items = [] + for i in range(k_partitions): + seqlen, idx = sorted_seqlen_list[offset + i] + items.append((idx, seqlen)) + heapq.heappush(states_pq, State(items=items, k=k_partitions)) + else: + for seqlen, idx in sorted_seqlen_list: + heapq.heappush(states_pq, State(items=[(idx, seqlen)], k=k_partitions)) + + # Merge states until only one remains + while len(states_pq) > 1: + state0 = heapq.heappop(states_pq) + state1 = heapq.heappop(states_pq) + state0.merge(state1) + heapq.heappush(states_pq, state0) + + final_state = states_pq[0] + partitions = final_state.get_partitions() + + # Validate and sort 
partitions + assert len(partitions) == k_partitions, f"{len(partitions)} != {k_partitions}" + seen_idx = set() + sorted_partitions = [] + + for i, partition in enumerate(partitions): + assert len(partition) > 0, f"the {i}-th partition is empty" + for idx in partition: + seen_idx.add(idx) + sorted_partitions.append(sorted(partition)) + + assert seen_idx == set(range(len(seqlen_list))), "Not all indices are covered" + + return sorted_partitions + + +def log_seqlen_unbalance(seqlen_list: list[int], partitions: list[list[int]], prefix): + """ + Calculate and log metrics related to sequence length imbalance before and after partitioning. + + Args: + seqlen_list (List[int]): A list of sequence lengths for each item. + partitions (List[List[int]]): A list of partitions, where each inner list contains indices + from seqlen_list assigned to that partition. + prefix (str): A prefix to be added to each metric key in the returned dictionary. + + Returns: + dict: A dictionary containing metrics related to sequence length imbalance. + """ + # Get the number of partitions + k_partition = len(partitions) + # assert len(seqlen_list) % k_partition == 0 + batch_size = len(seqlen_list) // k_partition + min_sum_seqlen = None + max_sum_seqlen = None + total_sum_seqlen = 0 + + # Iterate over each batch of sequence lengths + for offset in range(0, len(seqlen_list), batch_size): + cur_sum_seqlen = sum(seqlen_list[offset: offset + batch_size]) + if min_sum_seqlen is None or cur_sum_seqlen < min_sum_seqlen: + min_sum_seqlen = cur_sum_seqlen + if max_sum_seqlen is None or cur_sum_seqlen > max_sum_seqlen: + max_sum_seqlen = cur_sum_seqlen + total_sum_seqlen += cur_sum_seqlen + + balanced_sum_seqlen_list = [] + for partition in partitions: + cur_sum_seqlen_balanced = sum([seqlen_list[i] for i in partition]) + balanced_sum_seqlen_list.append(cur_sum_seqlen_balanced) + min_sum_seqlen_balanced = min(balanced_sum_seqlen_list) + max_sum_seqlen_balanced = max(balanced_sum_seqlen_list) + + return { + f"{prefix}/min": min_sum_seqlen, + f"{prefix}/max": max_sum_seqlen, + f"{prefix}/minmax_diff": max_sum_seqlen - min_sum_seqlen, + f"{prefix}/balanced_min": min_sum_seqlen_balanced, + f"{prefix}/balanced_max": max_sum_seqlen_balanced, + f"{prefix}/mean": total_sum_seqlen / len(partitions), + } + + +def batch_balance(batch: DataProto, dp_size, minibatch_size, logging_prefix="global_seqlen", keep_minibatch=False): + """ + ref: https://github.com/volcengine/verl/blob/2c0fcbe52a9230281329e7197501f4dc67f0a5d8/verl/trainer/ppo/ray_trainer.py#L1018 + Reorder the data on single controller such that each dp rank gets similar total tokens""" + attention_mask = batch.batch["attention_mask"] + batch_size = attention_mask.shape[0] + global_seqlen_lst = batch.batch["attention_mask"].view(batch_size, -1).sum(-1) # (train_batch_size,) + + def calculate_workload(seq_len_list): + return 24576 * seq_len_list + seq_len_list * seq_len_list + + workload_lst = calculate_workload(global_seqlen_lst) + world_size = dp_size + if keep_minibatch: + # Decouple the DP balancing and mini-batching. 
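batch_balance below feeds per-sample workloads into the Karmarkar-Karp partitioner defined above. A minimal sketch of what get_seqlen_balanced_partitions returns for a hand-made list of lengths (raw lengths are passed here instead of the quadratic workload transform, which the function also accepts):

from roll.utils.functionals import get_seqlen_balanced_partitions

# Six samples of very different lengths, balanced across two DP ranks.
seqlens = [1024, 96, 512, 768, 128, 320]
parts = get_seqlen_balanced_partitions(seqlens, k_partitions=2, equal_size=True)
# -> [[0, 1, 5], [2, 3, 4]]: each rank gets 3 samples, token sums 1440 vs 1408.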
+ minibatch_num = len(workload_lst) // minibatch_size + global_partition_lst = [[] for _ in range(world_size)] + for i in range(minibatch_num): + rearrange_minibatch_lst = get_seqlen_balanced_partitions( + workload_lst[i * minibatch_size: (i + 1) * minibatch_size], + k_partitions=world_size, + equal_size=True, + ) + for j, part in enumerate(rearrange_minibatch_lst): + global_partition_lst[j].extend([x + minibatch_size * i for x in part]) + else: + global_partition_lst = get_seqlen_balanced_partitions( + workload_lst, k_partitions=world_size, equal_size=True + ) + # Place smaller micro-batches at both ends to reduce the bubbles in pipeline parallel. + for idx, partition in enumerate(global_partition_lst): + partition.sort(key=lambda x: (workload_lst[x], x)) + ordered_partition = partition[::2] + partition[1::2][::-1] + global_partition_lst[idx] = ordered_partition + # reorder based on index. The data will be automatically equally partitioned by dispatch function + global_idx = torch.tensor([j for partition in global_partition_lst for j in partition]) + batch.reorder(global_idx) + global_balance_stats = log_seqlen_unbalance( + seqlen_list=global_seqlen_lst.detach().cpu().tolist(), partitions=global_partition_lst, prefix=logging_prefix + ) + metrics = {} + metrics.update(global_balance_stats) + return metrics + diff --git a/roll/utils/logging.py b/roll/utils/logging.py index 3d8f769d0..167bed6d2 100644 --- a/roll/utils/logging.py +++ b/roll/utils/logging.py @@ -7,6 +7,7 @@ def is_roll_debug_mode(): return os.getenv("ROLL_DEBUG", os.getenv("RAY_PROFILING", "0")) == "1" +logging.basicConfig(force=True, level=logging.DEBUG if is_roll_debug_mode() else logging.INFO) class CustomFormatter(logging.Formatter): def format(self, record): @@ -81,8 +82,14 @@ def get_logger() -> logging.Logger: handler.setFormatter(formatter) handler.set_name(_logger_name) _logger.addHandler(handler) + err_handler = logging.StreamHandler(sys.stderr) + err_handler.setFormatter(formatter) + err_handler.set_name(_logger_name) + err_handler.setLevel(logging.ERROR) + _logger.addHandler(err_handler) reset_file_logger_handler(_logger, log_dir, formatter) logger = _logger + logger.propagate = False return _logger diff --git a/roll/utils/metrics/metrics_manager.py b/roll/utils/metrics/metrics_manager.py index 3d2ba15ce..6b75661c2 100644 --- a/roll/utils/metrics/metrics_manager.py +++ b/roll/utils/metrics/metrics_manager.py @@ -1,8 +1,8 @@ from typing import Dict, Any, List, Optional import torch import numpy as np -from ray.util.timer import _Timer from codetiming import Timer +from contextlib import contextmanager from roll.utils.functionals import masked_mean, reduce_metrics @@ -414,3 +414,39 @@ def add_domain_all_metrics(self, global_step, batch_grouped: Dict[str, Any]) -> token_metrics = self.add_token_metrics(batch=domain_batch) self.add_domain_metrics(domain, token_metrics) self.metrics = original_metrics + +class DurationTracker: + def __init__(self): + self._clear() + + def observe(self, duration: float): + self.count += 1 + self.total += duration + self.min_time = min(self.min_time, duration) + self.max_time = max(self.max_time, duration) + self.mean = self.total / self.count if self.count > 0 else 0.0 + + @contextmanager + def track(self): + try: + with Timer(logger=None) as timer: + yield + finally: + self.observe(timer.last) + + def _clear(self): + self.count = 0 + self.total = 0.0 + self.min_time = float('inf') + self.max_time = float('-inf') + self.mean = 0.0 + + def log(self): + summary = { + 'count': self.count, + 
'min': self.min_time if self.min_time != float('inf') else 0.0, + 'max': self.max_time if self.max_time != float('-inf') else 0.0, + 'mean': round(self.mean, 6), + } + self._clear() + return summary diff --git a/roll/utils/send_recv_utils.py b/roll/utils/send_recv_utils.py index d816d1ff4..6eab849f5 100644 --- a/roll/utils/send_recv_utils.py +++ b/roll/utils/send_recv_utils.py @@ -1,13 +1,10 @@ -from dataclasses import dataclass -from typing import TYPE_CHECKING, Dict, List, Optional - -from roll.platforms import current_platform +from typing import Dict import torch +from torch.multiprocessing import reductions -if TYPE_CHECKING: - from torch import Tensor - +from roll.platforms import current_platform +from roll.utils.cuda_ipc_utils import MultiprocessingSerializer MAX_SHARD_SIZE = 5_000_000_000 # 5GB @@ -57,19 +54,13 @@ def get_tensor_size(tensor: "torch.Tensor") -> int: return tensor.numel() * tensor.element_size() -@dataclass -class StackedTensors: - tensors: Optional[List["torch.Tensor"]] - dim: int = 0 - - class TensorBucket: def __init__(self, bucket_size, device="cuda"): self.buffer = torch.empty(bucket_size, dtype=torch.int8, device=device) self.device = device self.bucket_size = bucket_size self.write_index = 0 - self.tensors_meta = {} + self.tensors_meta = [] def push_tensor(self, tensor: "torch.Tensor", tensor_start: int, name: str): required_bytes = get_tensor_size(tensor) - tensor_start @@ -79,12 +70,16 @@ def push_tensor(self, tensor: "torch.Tensor", tensor_start: int, name: str): self.buffer[bucket_start : bucket_start + save_bytes].copy_( tensor_bytes[tensor_start : tensor_start + save_bytes] ) - self.tensors_meta[name] = { - "bucket_start": bucket_start, - "tensor_start": tensor_start, - "save_bytes": save_bytes, - "tensor_meta": torch.empty_like(tensor, device="meta"), - } + self.tensors_meta.append( + { + "name": name, + "bucket_start": bucket_start, + "tensor_start": tensor_start, + "save_bytes": save_bytes, + "shape": list(tensor.shape), + "dtype": tensor.dtype, + } + ) self.write_index += save_bytes return save_bytes @@ -95,12 +90,16 @@ def pop_tensor(self, named_tensors: Dict[str, "torch.Tensor"]): @staticmethod def pop_tensor_in_buffer(named_tensors: Dict[str, "torch.Tensor"], tensors_meta, buffer: "torch.Tensor"): - for name, meta in tensors_meta.items(): - meta = tensors_meta[name] + for meta in tensors_meta: + name = meta["name"] bucket_start, tensor_start, save_bytes = meta["bucket_start"], meta["tensor_start"], meta["save_bytes"] tensor = named_tensors.get(name, None) if tensor is None: - tensor = torch.empty_like(meta["tensor_meta"], device=buffer.device) + tensor = torch.empty( + torch.Size(meta["shape"]), + dtype=meta["dtype"], + device=buffer.device, + ) named_tensors[name] = tensor tensor.view(-1).view(torch.int8)[tensor_start : tensor_start + save_bytes].copy_( buffer[bucket_start : bucket_start + save_bytes] @@ -108,7 +107,7 @@ def pop_tensor_in_buffer(named_tensors: Dict[str, "torch.Tensor"], tensors_meta, return named_tensors def drop(self): - self.tensors_meta = {} + self.tensors_meta = [] self.write_index = 0 def is_full(self): @@ -135,21 +134,6 @@ def pop_last_bucket(self): return self.bucket.tensors_meta, self.bucket.buffer return None, None - @staticmethod - def meta_to_dict(meta_infos): - """ - Convert tensor_meta from torch.Tensor of meta device to dict - """ - for _, meta_info in meta_infos.items(): - t = meta_info["tensor_meta"] - tensor_meta = { - "shape": list(t.shape), - "dtype": t.dtype, - "layout": t.layout, - "device": t.device, - } - 
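With tensors_meta now a plain list of dicts (name, shape, dtype) instead of meta-device tensors, the receiving side can rebuild tensors from just the metadata and the int8 buffer. A minimal sketch on CPU, assuming the bucket is large enough to hold the whole tensor in one push:

import torch
from roll.utils.send_recv_utils import TensorBucket

bucket = TensorBucket(bucket_size=64, device="cpu")       # device defaults to "cuda" in roll
weight = torch.arange(8, dtype=torch.float32)             # 32 bytes

saved = bucket.push_tensor(weight, tensor_start=0, name="layer.weight")
assert saved == weight.numel() * weight.element_size()

# Receiver side: only (tensors_meta, buffer) are needed to reconstruct the tensor.
restored = TensorBucket.pop_tensor_in_buffer({}, bucket.tensors_meta, bucket.buffer)
assert torch.equal(restored["layer.weight"], weight)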
meta_info["tensor_meta"] = tensor_meta - class RecvBucketManager: def __init__(self): @@ -158,7 +142,8 @@ def __init__(self): def process_bucket(self, tensors_meta, buffer): self.waiting_tensors = TensorBucket.pop_tensor_in_buffer(self.waiting_tensors, tensors_meta, buffer) finished_tensors = {} - for name, meta in tensors_meta.items(): + for meta in tensors_meta: + name = meta["name"] tensor_start, save_bytes = meta["tensor_start"], meta["save_bytes"] if tensor_start + save_bytes == get_tensor_size(self.waiting_tensors[name]): finished_tensors[name] = self.waiting_tensors.pop(name) @@ -167,15 +152,128 @@ def process_bucket(self, tensors_meta, buffer): def clear(self): assert len(self.waiting_tensors) == 0 - @staticmethod - def dict_to_meta(meta_infos): - for _, meta_info in meta_infos.items(): - tensor_meta = meta_info["tensor_meta"] - assert tensor_meta["device"] == torch.device("meta") - t = torch.empty( - torch.Size(tensor_meta["shape"]), - dtype=tensor_meta["dtype"], - layout=tensor_meta["layout"], - device=tensor_meta["device"], - ) - meta_info["tensor_meta"] = t + +# ref: https://github.com/sgl-project/sglang/blob/main/python/sglang/srt/utils/patch_torch.py +def monkey_patch_torch_reductions(): + """Monkey patching before Torch https://github.com/pytorch/pytorch/pull/149248 is fixed""" + + # Currently, NPU does not support UUID. This has been temporarily commented out, with support expected in the fourth quarter. + if current_platform.device_type == "npu": + return + + if hasattr(reductions, "_reduce_tensor_original"): + return + + reductions._reduce_tensor_original = reductions.reduce_tensor + reductions._rebuild_cuda_tensor_original = reductions.rebuild_cuda_tensor + + reductions.reduce_tensor = _reduce_tensor_modified + reductions.rebuild_cuda_tensor = _rebuild_cuda_tensor_modified + + reductions.init_reductions() + + +_REDUCE_TENSOR_ARG_DEVICE_INDEX = 6 + + +def _reduce_tensor_modified(*args, **kwargs): + output_fn, output_args = reductions._reduce_tensor_original(*args, **kwargs) + output_args = _modify_tuple(output_args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_to_uuid) + return output_fn, output_args + + +def _rebuild_cuda_tensor_modified(*args): + args = _modify_tuple(args, _REDUCE_TENSOR_ARG_DEVICE_INDEX, _device_from_maybe_uuid) + return reductions._rebuild_cuda_tensor_original(*args) + + +def _device_to_uuid(device: int) -> str: + return str(torch.cuda.get_device_properties(device).uuid) + + +def _device_from_maybe_uuid(device_maybe_uuid) -> int: + if isinstance(device_maybe_uuid, int): + return device_maybe_uuid + + if isinstance(device_maybe_uuid, str): + for device in range(torch.cuda.device_count()): + if str(torch.cuda.get_device_properties(device).uuid) == device_maybe_uuid: + return device + raise Exception("Invalid device_uuid=" + device_maybe_uuid) + + raise Exception(f"Unknown type: {device_maybe_uuid=}") + + +def _modify_tuple(t, index: int, modifier): + return *t[:index], modifier(t[index]), *t[index + 1 :] + + +def _bucket_named_tensors(named_tensors: list[tuple[str, torch.Tensor]]) -> tuple[torch.Tensor, list[dict]]: + if not named_tensors: + raise ValueError("Cannot create empty tensor bucket") + + tensors_meta = [] + flattened_tensors = [] + + current_idx = 0 + for i, (name, tensor) in enumerate(named_tensors): + flattened = tensor.flatten().view(torch.int8) + + numel = flattened.numel() + metadata = { + "name": name, + "shape": list(tensor.shape), # Convert to list for serialization + "dtype": tensor.dtype, + "start_idx": current_idx, + "end_idx": 
current_idx + numel, + "numel": numel, + } + tensors_meta.append(metadata) + flattened_tensors.append(flattened) + current_idx += numel + + flattened_tensor = torch.cat(flattened_tensors, dim=0) + return flattened_tensor, tensors_meta + + +def named_tensors_from_bucket(bucket: "torch.Tensor", tensors_meta: list[dict]) -> list[tuple[str, torch.Tensor]]: + reconstructed = [] + for i, meta in enumerate(tensors_meta): + tensor = bucket[meta["start_idx"] : meta["end_idx"]].view(meta["dtype"]).reshape(torch.Size(meta["shape"])) + reconstructed.append((meta["name"], tensor)) + return reconstructed + + +def serialize_named_weights(named_weights: list[tuple[str, torch.Tensor]], infer_strategy: str): + if infer_strategy == "sglang": + from sglang.srt.weight_sync.tensor_bucket import FlattenedTensorBucket + + try: + from sglang.srt.utils.patch_torch import ( + monkey_patch_torch_reductions as sglang_monkey_patch_torch_reductions, + ) # type: ignore + except ImportError: + from sglang.srt.patch_torch import ( + monkey_patch_torch_reductions as sglang_monkey_patch_torch_reductions, + ) # type: ignore + + sglang_monkey_patch_torch_reductions() + bucket = FlattenedTensorBucket(named_weights) + flattened_tensor_data = { + "flattened_tensor": bucket.get_flattened_tensor(), + "metadata": bucket.get_metadata(), + } + serialized_tensors = MultiprocessingSerializer.serialize(flattened_tensor_data) + return serialized_tensors + + bucket, tensors_meta = _bucket_named_tensors(named_weights) + + # PumpkinComment: + # FSDP2 will fail if using CPUOffload Policy without this check + if not getattr(bucket, "is_cuda", False): + bucket = bucket.to(current_platform.device_type).contiguous() + + monkey_patch_torch_reductions() + + serialized_tensors = MultiprocessingSerializer.serialize({"bucket": bucket, "tensors_meta": tensors_meta}) + return serialized_tensors diff --git a/roll/utils/sequence_packing.py b/roll/utils/sequence_packing.py index ec9c6f3a0..e485f1387 100644 --- a/roll/utils/sequence_packing.py +++ b/roll/utils/sequence_packing.py @@ -1,356 +1,343 @@ -import torch - -from roll.distributed.scheduler.protocol import DataProto -from roll.platforms import current_platform -from roll.utils.constants import IGNORE_INDEX +from __future__ import annotations +from typing import TYPE_CHECKING -""" -Loss computation wrappers for sequence packing training. -Handles unpacking model outputs and aligning with original sequence boundaries for loss calculation. 
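The helpers added above in send_recv_utils flatten a list of named weights into one int8 buffer plus serialisable metadata, and rebuild them on the other side. A minimal round-trip sketch on CPU; the tensor names and shapes are made up:

import torch
from roll.utils.send_recv_utils import _bucket_named_tensors, named_tensors_from_bucket

named = [
    ("embed.weight", torch.randn(4, 8)),
    ("lm_head.bias", torch.zeros(8, dtype=torch.float16)),
]
flat, meta = _bucket_named_tensors(named)      # one int8 buffer + per-tensor metadata
rebuilt = dict(named_tensors_from_bucket(flat, meta))

assert torch.equal(rebuilt["embed.weight"], named[0][1])
assert rebuilt["lm_head.bias"].dtype == torch.float16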
-""" +if TYPE_CHECKING: + from roll.distributed.scheduler.protocol import DataProto + from roll.utils.functionals import get_seqlen_balanced_partitions - -# TODO: use view of tensor in loss caculating instead of copy -class SequencePackingLossWrapper: +import torch +import math +import copy +from dataclasses import field, dataclass, asdict +from typing import Iterator, Tuple, Dict, List +import torch.distributed as dist +from roll.configs.worker_config import SequencePackingConfig + +def make_micro_batch_iter_for_sequence_packing(mini_batch, tp_size, cp_size, vp_size, is_train=False, dp_group=None, + micro_batch_size=None, config: SequencePackingConfig = None): + packer = get_sequence_packing_packer(config) + return packer.make_micro_batch_iter_for_sequence_packing(mini_batch, tp_size, cp_size, vp_size, is_train, dp_group, micro_batch_size) + +def restore_results_order( + results: Dict[str, torch.Tensor], + partition_indices_list: List[List[int]], + config: SequencePackingConfig = None + ) -> Dict[str, torch.Tensor]: + packer = get_sequence_packing_packer(config) + return packer.restore_results_order(results, partition_indices_list) + + +def get_sequence_packing_packer(config: SequencePackingConfig = None): + """Factory function to get the appropriate sequence packing algorithm.""" + if config==None: + config = SequencePackingConfig() + if config.algorithm == 'load_balance': + return LoadBalancePacker(config) + elif config.algorithm == 'none': + return SequencePackingPacker(config) + else: + raise ValueError(f"Illegal sequence packing algorithm {config.algorithm}," + f" algorithm must be in ['none', 'load_balance']") + + +class SequencePackingPacker: """ - Base wrapper for computing loss on packed sequences. - - In sequence packing, multiple sequences are concatenated and padded to form a single packed sequence. - This wrapper handles: - 1. Unpacking model outputs back to individual sequences - 2. Aligning original data (labels, masks) with unpacked outputs - 3. Computing loss on properly aligned data + Sequence Packing Packer """ - def __init__( - self, - strategy, - loss_func, - ): + def __init__(self, config: SequencePackingConfig = None): + self.config = config if config is not None else SequencePackingConfig() + + def get_pad_factor(self, cp_size, tp_size): + """Calculate padding factor based on parallelism configuration.""" + pad_factor = cp_size * 2 * tp_size if cp_size > 1 else tp_size + pad_factor = math.lcm(16, pad_factor) + return pad_factor + + @staticmethod + def calculate_workload(seqlen: int) -> float: """ - Args: - strategy: Training strategy containing model and distributed config - loss_func: Loss function to apply - cu_seqlens_q: Cumulative sequence lengths of original (unpadded) sequences - cu_seqlens_q_padded: Cumulative sequence lengths after padding for packing - logger: Optional logger + Calculate workload (simulating Transformer FLOPs). 
+ FLOPs ∝ 6 * hidden_size * seqlen + seqlen^2 + Using hidden_size=4096 as reference (7B model) """ - self.strategy = strategy - self.loss_func = loss_func - self.cu_seqlens = None - self.cu_seqlens_padded = None - self.logger = None - - def set_packing_params(self, cu_seqlens, cu_seqlens_padded, logger): - self.cu_seqlens = cu_seqlens - self.cu_seqlens_padded = cu_seqlens_padded - self.logger = logger - - def _unpack_output_tensor(self, output_tensor): + return 24576 * seqlen + seqlen * seqlen + + @staticmethod + def ceildiv(a: int, b: int) -> int: + """Ceiling division.""" + return -(a // -b) + + def make_micro_batch_iter_for_sequence_packing( + self, + mini_batch: DataProto, + tp_size, cp_size, vp_size, is_train=False, + dp_group=None, micro_batch_size=None + ) -> Iterator[DataProto]: + assert micro_batch_size is not None, "SequencePackingPacker: micro_batch_size is None" + mini_batch_size = len(mini_batch) + mini_batch.meta_info['partition_indices_list'] = [] + num_microbatches = mini_batch_size // micro_batch_size + mini_batch.meta_info['num_micro_batchs'] = num_microbatches + return iter(mini_batch.chunk(chunks=num_microbatches)) + + @staticmethod + def restore_results_order( + results: Dict[str, torch.Tensor], + partition_indices_list: List[List[int]] + ) -> Dict[str, torch.Tensor]: + return results + + + +class LoadBalancePacker(SequencePackingPacker): + @staticmethod + def roundup_divisible(a: int, b: int) -> int: + """Round up a to be divisible by b.""" + return ((a + b - 1) // b) * b + + @staticmethod + def get_device_name(): + """Get current device name.""" + if torch.cuda.is_available(): + return f"cuda:{torch.cuda.current_device()}" + return "cpu" + + @staticmethod + def calculate_workload_batch(seqlen_tensor: torch.Tensor) -> torch.Tensor: """ - Unpack model output tensor from packed format back to individual sequences. - - The packed output contains multiple sequences concatenated together. This method - splits them back using padded cumulative sequence lengths, accounting for context - parallelism partitioning. + Calculate workload for a batch of sequences. Args: - output_tensor: Packed model output with shape (batch=1, packed_seq_len, hidden_dim) + seqlen_tensor: Tensor of sequence lengths Returns: - List of unpacked tensors, one per original sequence, each with shape - (batch=1, padded_seq_len, hidden_dim) + Tensor of workloads """ - cp_size = self.strategy.worker.rank_info.cp_size - - # Calculate sequence boundaries in the packed tensor - # Padded cumulative lengths mark where each sequence starts/ends after packing - padded_cu_seqlens = self.cu_seqlens_padded - - # Adjust for context parallelism: each rank only holds a portion of the sequence - seq_starts = padded_cu_seqlens[:-1] // cp_size - seq_ends = padded_cu_seqlens[1:] // cp_size - - # Extract each sequence from the packed tensor - unpacked_output_tensor_list = [] - for seq_idx, (seq_start, seq_end) in enumerate(zip(seq_starts, seq_ends)): - unpacked_output_tensor_list.append(output_tensor[:, seq_start:seq_end, :]) - return unpacked_output_tensor_list - - def _pad_tensor_to_target_length(self, tensor, target_length, pad_val=0, pad_dim=0): + return 24576 * seqlen_tensor + seqlen_tensor * seqlen_tensor + + def make_micro_batch_iter_for_sequence_packing( + self, + mini_batch: DataProto, + tp_size: int, + cp_size: int, + vp_size: int, + is_train=False, + dp_group=None, + micro_batch_size=None + ) -> Iterator[DataProto]: """ - Pad tensor along the specified dimension to reach the target length by padding on the right. 
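get_pad_factor and the workload formula above drive how LoadBalancePacker sizes and balances its micro batches in the method below. A small worked sketch that mirrors those formulas rather than importing them:

import math

def pad_factor(cp_size: int, tp_size: int) -> int:
    # mirrors SequencePackingPacker.get_pad_factor
    base = cp_size * 2 * tp_size if cp_size > 1 else tp_size
    return math.lcm(16, base)

pad_factor(1, 4)    # 16: tp only, lcm with 16
pad_factor(2, 4)    # 16: 2 * 2 * 4
pad_factor(2, 6)    # 48: lcm(16, 24)

# A 1000-token sample padded to a multiple of 16 becomes 1008 tokens, and its
# scheduling weight under the FLOPs proxy is 24576 * 1008 + 1008 ** 2, about 25.8M.
workload = 24576 * 1008 + 1008 ** 2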
+ Split mini_batch into micro batches with sequence packing strategy. - Args: - tensor: Input tensor to pad - target_length: Desired length along pad_dim - pad_val: Value to use for padding - pad_dim: Dimension to pad along + This function: + 1. Calculates the optimal number of micro batches based on max_packed_sequence_length + 2. Ensures all DP ranks have the same number of micro batches + 3. Ensures the number of micro batches is divisible by vp_size + 4. Balances workload across micro batches using Karmarkar-Karp algorithm + 5. Optimizes scheduling by placing smaller batches at edges - Returns: - Padded tensor with length target_length along pad_dim + Args: + mini_batch: Input mini batch data containing: + - batch: TensorDict with tensors including 'input_ids' and 'attention_mask' + - non_tensor_batch: Dict with non-tensor data + - meta_info: Dict with metadata + tp_size: Tensor parallel size + cp_size: Context parallel size + vp_size: Virtual pipeline parallel size (must divide num_micro_batches) + max_packed_sequence_length: Maximum total sequence length per micro batch + dp_group: Data parallel process group for synchronization + + Yields: + DataProto: Micro batches with balanced workload + + Raises: + AssertionError: If max_packed_sequence_length < max sequence length in batch """ - seq_len = tensor.shape[pad_dim] - - if target_length > seq_len: - pad_size = target_length - seq_len - - # Construct padding specification for torch.nn.functional.pad - # Format: [pad_left, pad_right] for each dim from last to first - pad_list = [0, 0] * tensor.ndim - pad_list[2 * (tensor.ndim - 1 - pad_dim) + 1] = pad_size - - tensor = torch.nn.functional.pad(tensor, pad_list, value=pad_val) - - return tensor - - def _align_to_unpacked_output_tensor_shape(self, tensor, pad_val=0): + assert dp_group is not None, "LoadBalancePacker: dp_group is None" + # Calculate effective sequence lengths for each sample + # For regular tensors, use attention mask + attention_mask = mini_batch.batch["attention_mask"] + max_seq_len = attention_mask.shape[-1] + seq_len_effective: torch.Tensor = attention_mask.sum(dim=1) + pad_factor = self.get_pad_factor(cp_size, tp_size) + seq_len_effective = ((seq_len_effective + pad_factor - 1) // pad_factor) * pad_factor + + if is_train: + max_packed_sequence_length = self.config.max_packed_sequence_length_train + else: + max_packed_sequence_length = self.config.max_packed_sequence_length_forward + assert max_packed_sequence_length is not None, "LoadBalancePacker: max_packed_sequence_length is None" + # Validate that max_packed_sequence_length is sufficient + assert max_packed_sequence_length >= max_seq_len, ( + f"max_packed_sequence_length ({max_packed_sequence_length}) must be >= " + f"max sequence length in batch ({max_seq_len})" + ) + + batch_size = len(seq_len_effective) + total_seqlen = seq_len_effective.sum().item() + + # Step 2: Calculate initial number of micro batches + # Base calculation: how many batches do we need to fit all tokens? 
+ num_micro_batches = max(1, self.ceildiv(total_seqlen, max_packed_sequence_length)) + + # Cannot have more micro batches than samples + num_micro_batches = min(num_micro_batches, batch_size) + + if is_train: + min_num_micro_batches = self.config.min_num_micro_batches_train + else: + min_num_micro_batches = self.config.min_num_micro_batches_forward + num_micro_batches = max(num_micro_batches, min_num_micro_batches) + + # Step 3: Synchronize across DP ranks (all ranks must have same count) + if dist.is_initialized() and dp_group is not None: + num_micro_batches_tensor = torch.tensor( + [num_micro_batches], + device=self.get_device_name() + ) + # Use MAX to ensure all ranks can accommodate their data + dist.all_reduce( + num_micro_batches_tensor, + op=dist.ReduceOp.MAX, + group=dp_group + ) + num_micro_batches = num_micro_batches_tensor.cpu().item() + + # Step 4: Round up to be divisible by vp_size + if vp_size > 1: + num_micro_batches = self.roundup_divisible(num_micro_batches, vp_size) + + # Step 5: Calculate workload for load balancing + # Use squared sequence length as proxy for attention computation cost + workloads = self.calculate_workload_batch(seq_len_effective) + + from roll.utils.functionals import get_seqlen_balanced_partitions + # Step 6: Partition samples into micro batches with balanced workload + micro_batch_indices = get_seqlen_balanced_partitions( + seqlen_list=workloads.tolist(), + k_partitions=num_micro_batches, + equal_size=False # Allow variable sizes for better balance + ) + + # Step 7: Sort and reorder for better pipeline scheduling + # Sort by workload (descending) to identify large and small batches + micro_batch_indices_with_workload = [ + ( + partition, + sum(workloads[idx].item() for idx in partition), + partition[0] if partition else 0 # tie-breaker + ) + for partition in micro_batch_indices + ] + + micro_batch_indices_with_workload.sort( + key=lambda x: (x[1], x[2]), + reverse=True + ) + + # Reorder: place smaller batches at both ends to reduce pipeline bubbles + # Pattern: [small, large, large, ..., large, small] + sorted_indices = [x[0] for x in micro_batch_indices_with_workload] + reordered_indices = sorted_indices[::2][::-1] + sorted_indices[1::2] + + mini_batch.meta_info['partition_indices_list'] = reordered_indices.copy() + + # Step 8: Generate micro batches + generated_count = 0 + + for partition in reordered_indices: + if len(partition) == 0: + # Skip empty partitions (shouldn't happen but be safe) + continue + + # Use DataProto's select_idxs method to create micro batch + micro_batch_proto = mini_batch.select_idxs(partition) + + # Add metadata about this micro batch + micro_batch_proto.meta_info = copy.deepcopy(mini_batch.meta_info) + micro_batch_proto.meta_info['micro_batch_idx'] = generated_count + micro_batch_proto.meta_info['is_padding_batch'] = False + micro_batch_proto.meta_info['partition_indices'] = partition + micro_batch_proto.meta_info['num_micro_batchs'] = num_micro_batches + micro_batch_proto.meta_info['mini_batch_size'] = mini_batch.batch.batch_size[0] + + yield micro_batch_proto + generated_count += 1 + + # Verify we generated the correct number of micro batches + assert generated_count == num_micro_batches, ( + f"Generated {generated_count} micro batches but expected {num_micro_batches}" + ) + + @staticmethod + def restore_results_order( + results: Dict[str, torch.Tensor], + partition_indices_list: List[List[int]] + ) -> Dict[str, torch.Tensor]: """ - Align original data tensors (labels, masks) to match unpacked output shape. 
+ Restore computation results to their original order after load-balanced partitioning. - Original data comes in shape (batch, max_seq_len, ...) where batch contains multiple - sequences with varying actual lengths. This method: - 1. Extracts each sequence's valid portion (up to its original unpadded length) - 2. Pads it to match the padded length used during packing - - This ensures original data aligns with unpacked model outputs for loss computation. + During load balancing, samples are reordered into partitions by sequence length. + This function reverses that reordering to match the original input order. Args: - tensor: Original data tensor with shape (batch, seq_len, ...) - pad_val: Value used for padding (e.g., IGNORE_INDEX for labels, 0 for masks) + results: Dict of computation results where first dimension is in partitioned order + e.g., {'logits': [total_batch, ...], 'loss': [total_batch]} + partition_indices_list: List of original indices for each partition + (from mini_batch.meta_info['partition_indices_list']) Returns: - List of aligned tensors, each with shape (1, padded_seq_len, ...) matching - the corresponding unpacked output tensor - """ - # Get original unpadded sequence lengths (actual data before packing) - unpadded_seq_lengths = self.cu_seqlens[1:] - self.cu_seqlens[:-1] - - # Get padded sequence lengths (after padding during packing) - padded_seq_lengths = self.cu_seqlens_padded[1:] - self.cu_seqlens_padded[:-1] + Dict with same keys but tensors reordered to original sample order - source_seq_lengths = unpadded_seq_lengths # Valid data length - target_seq_lengths = padded_seq_lengths # Target length after packing - - aligned_tensor_list = [] - for seq_idx, (source_len, target_len) in enumerate( - zip(source_seq_lengths, target_seq_lengths) - ): - # Extract valid portion: truncate to original unpadded length - seq_tensor = tensor[seq_idx:seq_idx + 1, :source_len] + Example: + # Create micro batches with load balancing + micro_batches_iter = packer.make_micro_batch_iter_for_sequence_packing( + mini_batch=mini_batch, ... 
+ ) + partition_indices_list = mini_batch.meta_info['partition_indices_list'] - # Pad to match the padded length used in packing - seq_tensor = self._pad_tensor_to_target_length(seq_tensor, target_len, pad_val=pad_val, pad_dim=1) + # Compute (results are concatenated across partitions) + results = model(micro_batches_iter) # {'logits': [total_batch, ...]} - # Keep batch dimension (1) to match unpacked output format - aligned_tensor_list.append(seq_tensor) + # Restore original order + restored = LoadBalancePacker.restore_results_order( + results, partition_indices_list + ) + """ + if not results: + return {} - return aligned_tensor_list + # Flatten partition indices to get current -> original mapping + original_indices = [] + for partition_indices in partition_indices_list: + original_indices.extend(partition_indices) - def __call__(self, data: DataProto, output_tensor: torch.Tensor): - return self.loss_func(data, output_tensor) + # Build inverse mapping: original position -> current position + # original_indices[current_pos] = original_pos + # reorder_indices[original_pos] = current_pos + total_samples = len(original_indices) + reorder_indices = [0] * total_samples + for current_pos, original_pos in enumerate(original_indices): + reorder_indices[original_pos] = current_pos + reorder_indices_tensor = torch.tensor(reorder_indices, dtype=torch.long) -# SFT -class SequencePackingSFTLossWrapper(SequencePackingLossWrapper): - """ - Wrapper for SFT loss computation with packed sequences. + # Reorder each tensor result + restored_results = {} + for key, tensor in results.items(): + if isinstance(tensor, torch.Tensor) and tensor.dim() > 0: + assert tensor.shape[0] == total_samples, \ + f"Tensor '{key}' batch size {tensor.shape[0]} != total samples {total_samples}" - For SFT, labels are already packed in the same format as model outputs, - so we can directly compute loss without unpacking. - """ + restored_results[key] = tensor[reorder_indices_tensor] + else: + # Scalar or non-tensor, keep as-is + restored_results[key] = tensor - def __call__(self, data: DataProto, output_tensor: torch.Tensor): - # Use pre-packed labels that match the packed output format - labels = data.meta_info['labels_packed'] - return self.loss_func(DataProto.from_dict(tensors={'labels': labels}), output_tensor) + return restored_results -# Distillation -class SequencePackingDistillForwardWrapper(SequencePackingLossWrapper): - """ - Wrapper for teacher model forward pass in distillation with packed sequences. - Computes teacher logits from packed outputs and prepares them for student training: - 1. Unpacks teacher outputs to individual sequences - 2. Computes full vocabulary logits or topk logits for each sequence - 3. Pads logits back to original max sequence length for easy alignment with student - """ - def __init__(self, strategy, loss_func): - super().__init__(strategy, loss_func) - self.forward_func = loss_func - def __call__(self, data: DataProto, output_tensor: torch.Tensor, non_loss_data: bool = True): - """ - Compute teacher logits from packed outputs. 
- - Args: - data: Input data protocol - output_tensor: Packed teacher model outputs - non_loss_data: Flag indicating this is for data generation, not loss computation - - Returns: - Tuple of (dummy_loss, dict with teacher logits and topk indices) - """ - # Step 1: Unpack teacher outputs to individual sequences - unpacked_output_tensor_list = self._unpack_output_tensor(output_tensor) - - # Step 2: Compute logits for each sequence - # Gather across tensor/context parallel ranks to get full logits - teacher_topk_probs_list = [] - teacher_topk_log_probs_list = [] - teacher_topk_indices_list = [] - teacher_topk_inf_mask_list = [] - for idx, unpacked_output_tensor in enumerate(unpacked_output_tensor_list): - # Compute logits with full vocabulary (or topk for efficiency) - teacher_topk_probs, teacher_topk_log_probs, teacher_topk_indices, teacher_topk_inf_mask = self.strategy.op_compute_topk_probs_and_indices( - unpacked_output_tensor, - topk=self.strategy.worker.pipeline_config.logits_topk, - target_vocab_size=self.strategy.worker.pipeline_config.target_vocab_size, - kd_temperature=self.strategy.worker.pipeline_config.kd_temperature, - teacher_temperature=self.strategy.worker.pipeline_config.teacher_temperature - ) - # Step 3: Pad each sequence's logits to max sequence length - # This makes them easy to align with original student data later - max_length = self.strategy.worker.pipeline_config.sequence_length - teacher_topk_probs = self._pad_tensor_to_target_length(teacher_topk_probs, max_length, pad_val=0, pad_dim=1) - teacher_topk_log_probs = self._pad_tensor_to_target_length(teacher_topk_log_probs, max_length, pad_val=0, pad_dim=1) - teacher_topk_indices = self._pad_tensor_to_target_length(teacher_topk_indices, max_length, pad_val=0, pad_dim=1) - teacher_topk_inf_mask = self._pad_tensor_to_target_length(teacher_topk_inf_mask, max_length, pad_val=1, pad_dim=1) - - teacher_topk_probs_list.append(teacher_topk_probs) - teacher_topk_log_probs_list.append(teacher_topk_log_probs) - teacher_topk_indices_list.append(teacher_topk_indices) - teacher_topk_inf_mask_list.append(teacher_topk_inf_mask) - - # Concatenate all sequences back into batch format - teacher_topk_probs = torch.cat(teacher_topk_probs_list, dim=0) - teacher_topk_log_probs = torch.cat(teacher_topk_log_probs_list, dim=0) - teacher_topk_indices = torch.cat(teacher_topk_indices_list, dim=0) - teacher_topk_inf_mask = torch.cat(teacher_topk_inf_mask_list, dim=0) - - # Return dummy loss (teacher forward doesn't compute loss) and teacher outputs - return torch.tensor(0., device=output_tensor.device), { - 'topk_probs': teacher_topk_probs.detach(), - 'topk_log_probs': teacher_topk_log_probs.detach(), - 'topk_indices': teacher_topk_indices.detach(), - 'topk_inf_mask': teacher_topk_inf_mask.detach() - } - - -class SequencePackingDistillLossWrapper(SequencePackingLossWrapper): - """ - Wrapper for computing distillation loss with packed sequences. - - Combines language modeling loss and distillation loss: - 1. Unpacks student model outputs to individual sequences - 2. Aligns original labels and teacher outputs with unpacked student outputs - 3. Computes both standard LM loss and KL divergence with teacher for each sequence - 4. Combines losses with configurable weighting - """ - - def __call__(self, data: DataProto, output_tensor: torch.Tensor): - """ - Compute combined distillation and language modeling loss. 
- - Args: - data: Input data containing original labels and masks - output_tensor: Packed student model outputs - - Returns: - Tuple of (total_loss, metrics_dict) - """ - # Step 1: Compute student logits from packed outputs - # Keep them partitioned across tensor/context parallel for memory efficiency - student_logits = output_tensor - - # Step 2: Unpack student logits to individual sequences (still cp-partitioned) - student_logits_list = self._unpack_output_tensor(student_logits) - - # Step 3: Get original data from dataloader (not packed) - labels = data.batch['labels_for_loss'] - attention_mask = data.batch['attention_mask'] - - # Step 4: Align original data with unpacked outputs - # Truncate to original length and pad to match packing padding - aligned_labels_list = self._align_to_unpacked_output_tensor_shape(labels, pad_val=IGNORE_INDEX) - aligned_attention_mask_list = self._align_to_unpacked_output_tensor_shape(attention_mask, pad_val=0) - - # Step 5: Get and align teacher outputs (pre-computed in teacher forward pass) - if self.strategy.worker.teacher_probs_iterator is not None: - teacher_probs = next(self.strategy.worker.teacher_probs_iterator) - aligned_teacher_probs_list = self._align_to_unpacked_output_tensor_shape(teacher_probs) - else: - teacher_probs = None - if self.strategy.worker.teacher_log_probs_iterator is not None: - teacher_log_probs = next(self.strategy.worker.teacher_log_probs_iterator) - aligned_teacher_log_probs_list = self._align_to_unpacked_output_tensor_shape(teacher_log_probs) - else: - teacher_log_probs = None - if self.strategy.worker.teacher_topk_indices_iterator is not None: - teacher_topk_indices = next(self.strategy.worker.teacher_topk_indices_iterator) - aligned_teacher_topk_indices_list = self._align_to_unpacked_output_tensor_shape(teacher_topk_indices) - else: - teacher_topk_indices = None - if self.strategy.worker.teacher_inf_mask_iterator is not None: - teacher_inf_mask = next(self.strategy.worker.teacher_inf_mask_iterator) - aligned_teacher_inf_mask_list = self._align_to_unpacked_output_tensor_shape(teacher_inf_mask) - else: - teacher_inf_mask = None - - - # Step 6: Accumulate losses across all sequences in the batch - total_gpt_loss = torch.tensor(0, device=current_platform.device_type, dtype=torch.float32) - total_distill_loss = torch.tensor(0, device=current_platform.device_type, dtype=torch.float32) - total_valid_tokens = 0 - total_valid_tokens_distill = 0 - - batch_size = len(student_logits_list) - for idx in range(batch_size): - # Get aligned data for this sequence - single_student_logits = student_logits_list[idx] - single_label = aligned_labels_list[idx] - single_teacher_probs = aligned_teacher_probs_list[idx] if teacher_probs is not None else None - single_teacher_log_probs = aligned_teacher_log_probs_list[idx] if teacher_log_probs is not None else None - single_teacher_topk_indices = aligned_teacher_topk_indices_list[idx] if teacher_topk_indices is not None else None - single_teacher_inf_mask = aligned_teacher_inf_mask_list[idx] if teacher_inf_mask is not None else None - - # Compute standard language modeling loss (cross-entropy with labels) - local_gpt_loss, local_valid_tokens = self.strategy.op_compute_language_loss_from_logits( - single_student_logits, single_label, - reduction="sum") - total_gpt_loss += local_gpt_loss - total_valid_tokens += local_valid_tokens - - # Compute distillation loss (KL divergence between student and teacher) - local_distill_loss, local_valid_tokens_distill = self.strategy.op_compute_various_divergence( - 
self.strategy.worker.kl_loss_func, - single_student_logits, single_teacher_probs, - single_teacher_log_probs, single_teacher_topk_indices, - single_teacher_inf_mask, single_label, - attention_mask=None, reduction="sum") - - total_distill_loss += local_distill_loss - total_valid_tokens_distill += local_valid_tokens_distill - - # Step 7: Normalize losses by number of valid tokens - if total_valid_tokens == 0: - total_valid_tokens = 1 - if total_valid_tokens_distill == 0: - total_valid_tokens_distill = 1 - gpt_loss = total_gpt_loss / total_valid_tokens - distill_loss = total_distill_loss / total_valid_tokens_distill - - # Step 8: Combine losses with configured weighting - # loss = (1 - α) * LM_loss + α * distill_loss - loss = ((1 - self.strategy.worker.pipeline_config.distill_loss_weight) * gpt_loss - + self.strategy.worker.pipeline_config.distill_loss_weight * distill_loss) - - student_metrics = { - "train/loss": loss.detach().item(), - "train/train_distill_loss": distill_loss.detach().item(), - "train/train_student_loss": gpt_loss.detach().item(), - } - return loss, student_metrics diff --git a/roll/utils/str_utils.py b/roll/utils/str_utils.py index 04dbd2db7..7ca5abfbc 100644 --- a/roll/utils/str_utils.py +++ b/roll/utils/str_utils.py @@ -1,4 +1,11 @@ +import io +import os import re +import sys +import dataclasses +from typing import Any, Optional + +from omegaconf import OmegaConf def contains_renderable_field(s: str, key: str) -> bool: """ @@ -18,4 +25,62 @@ def contains_renderable_field(s: str, key: str) -> bool: raise TypeError("Input 'key' must be a string.") pattern = r"\{" + re.escape(key) + r"(?!\w).*\}" - return re.search(pattern, s) is not None \ No newline at end of file + return re.search(pattern, s) is not None + + +def print_pipeline_config(config_obj: Any, enable_color: bool = False) -> None: + def convert_to_dict(obj): + if dataclasses.is_dataclass(obj): + return {f.name: convert_to_dict(getattr(obj, f.name)) for f in dataclasses.fields(obj)} + if isinstance(obj, (set, frozenset)): + try: + return sorted(list(obj), key=str) + except TypeError: + return list(obj) + if isinstance(obj, dict): + return {k: convert_to_dict(v) for k, v in obj.items()} + if isinstance(obj, (list, tuple)): + return [convert_to_dict(item) for item in obj] + return obj + + buf = io.StringIO() + + ANSI_RESET = "\033[0m" + ANSI_MAGENTA = "\033[95m" + ANSI_CYAN = "\033[96m" + ANSI_GREEN = "\033[92m" + ANSI_YELLOW = "\033[93m" + COLORS_BY_LEVEL = [ANSI_CYAN, ANSI_GREEN, ANSI_YELLOW, ANSI_MAGENTA] + + use_color = enable_color and sys.stdout.isatty() and os.getenv("NO_COLOR") is None + + def wrap(text: str, color: Optional[str]) -> str: + if use_color and color: + return f"{color}{text}{ANSI_RESET}" + return text + + def bprint(s: str, color: Optional[str] = None): + print(wrap(s, color), file=buf) + + def colorize_yaml(yaml_text: str) -> str: + colored_lines = [] + for line in yaml_text.splitlines(): + stripped = line.lstrip() + indent = len(line) - len(stripped) + level_color = COLORS_BY_LEVEL[(indent // 2) % len(COLORS_BY_LEVEL)] + if ":" in stripped: + key, rest = stripped.split(":", 1) + rest = rest.rstrip() + suffix = f": {rest.strip()}" if rest.strip() else ":" + colored_lines.append(f"{' ' * indent}{wrap(key, level_color)}{suffix}") + else: + colored_lines.append(f"{' ' * indent}{wrap(stripped, level_color)}") + return "\n".join(colored_lines) + + bprint("\n====== Pipeline Config ======", ANSI_MAGENTA) + bprint("------ merged & post-init ------", ANSI_CYAN) + config_dict = 
convert_to_dict(config_obj)
+    yaml_text = OmegaConf.to_yaml(OmegaConf.create(config_dict), resolve=True).rstrip()
+    bprint(colorize_yaml(yaml_text))
+    bprint("====== End Config ======", ANSI_MAGENTA)
+    print(buf.getvalue())
\ No newline at end of file
diff --git a/roll/utils/taskgroups.py b/roll/utils/taskgroups.py
new file mode 100644
index 000000000..d7fd737c0
--- /dev/null
+++ b/roll/utils/taskgroups.py
@@ -0,0 +1,298 @@
+# Borrowed from CPython:
+# https://github.com/python/cpython/blob/3.12/Lib/asyncio/taskgroups.py
+#
+# Copyright (c) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010,
+# 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023 Python Software Foundation;
+# All Rights Reserved
+#
+# Adapted with permission from the EdgeDB project;
+# license: PSFL.
+#
+#
+# This implementation of TaskGroup does not match all the semantics of CPython's TaskGroup.
+# On an exception in the task group, we still cancel the parent task but do not uncancel it,
+# so we will raise `asyncio.CancelledError` if any exceptions other than
+# `asyncio.CancelledError` are raised within a task.
+#
+# And because Python 3.11 and above need Task.uncancel() in addition to catching
+# `asyncio.CancelledError` to suppress the cancellation, users should not suppress
+# cancellation when using this implementation of TaskGroup, to keep compatibility.
+
+
+__all__ = ("TaskGroup",)
+
+from asyncio import events
+from asyncio import exceptions
+from asyncio import tasks
+
+
+class TaskGroup:
+    """Asynchronous context manager for managing groups of tasks.
+
+    Example use:
+
+        async with asyncio.TaskGroup() as group:
+            task1 = group.create_task(some_coroutine(...))
+            task2 = group.create_task(other_coroutine(...))
+        print("Both tasks have completed now.")
+
+    All tasks are awaited when the context manager exits.
+
+    Any exceptions other than `asyncio.CancelledError` raised within
+    a task will cancel all remaining tasks and wait for them to exit.
+    The first exception is then raised (BaseExceptionGroup is not
+    supported on Python 3.10).
+
+    This implementation of TaskGroup does not match all the semantics of CPython's TaskGroup.
+    On an exception in the task group, we still cancel the parent task but do not uncancel it,
+    so we will raise `asyncio.CancelledError` if any exceptions other than
+    `asyncio.CancelledError` are raised within a task.
+
+    And because Python 3.11 and above need Task.uncancel() in addition to catching
+    `asyncio.CancelledError` to suppress the cancellation, users should not suppress
+    cancellation when using this implementation of TaskGroup, to keep compatibility.
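+
+    For example (illustrative only; `might_fail` and `other_work` are placeholder
+    names), the following pattern should be avoided with this TaskGroup, because
+    on Python 3.11+ swallowing the cancellation would also require Task.uncancel(),
+    which this implementation never calls:
+
+        async with TaskGroup() as group:
+            group.create_task(might_fail(...))
+            try:
+                await other_work()
+            except asyncio.CancelledError:
+                pass  # anti-pattern: suppresses the group's cancellation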
+ """ + def __init__(self): + self._entered = False + self._exiting = False + self._aborting = False + self._loop = None + self._parent_task = None + self._parent_cancel_requested = False + self._tasks = set() + self._errors = [] + self._base_error = None + self._on_completed_fut = None + + def __repr__(self): + info = [''] + if self._tasks: + info.append(f'tasks={len(self._tasks)}') + if self._errors: + info.append(f'errors={len(self._errors)}') + if self._aborting: + info.append('cancelling') + elif self._entered: + info.append('entered') + + info_str = ' '.join(info) + return f'' + + async def __aenter__(self): + if self._entered: + raise RuntimeError( + f"TaskGroup {self!r} has already been entered") + if self._loop is None: + self._loop = events.get_running_loop() + self._parent_task = tasks.current_task(self._loop) + if self._parent_task is None: + raise RuntimeError( + f'TaskGroup {self!r} cannot determine the parent task') + self._entered = True + + return self + + async def __aexit__(self, et, exc, tb): + tb = None + try: + return await self._aexit(et, exc) + finally: + # Exceptions are heavy objects that can have object + # cycles (bad for GC); let's not keep a reference to + # a bunch of them. It would be nicer to use a try/finally + # in __aexit__ directly but that introduced some diff noise + self._parent_task = None + self._errors = None + self._base_error = None + exc = None + + async def _aexit(self, et, exc): + self._exiting = True + + if (exc is not None and + self._is_base_error(exc) and + self._base_error is None): + self._base_error = exc + + propagate_cancellation_error = \ + exc if et is exceptions.CancelledError else None + # TODO not supported on python 3.10 + # if self._parent_cancel_requested: + # # If this flag is set we *must* call uncancel(). + # if self._parent_task.uncancel() == 0: + # # If there are no pending cancellations left, + # # don't propagate CancelledError. + # propagate_cancellation_error = None + + if et is not None: + if not self._aborting: + # Our parent task is being cancelled: + # + # async with TaskGroup() as g: + # g.create_task(...) + # await ... # <- CancelledError + # + # or there's an exception in "async with": + # + # async with TaskGroup() as g: + # g.create_task(...) + # 1 / 0 + # + self._abort() + + # We use while-loop here because "self._on_completed_fut" + # can be cancelled multiple times if our parent task + # is being cancelled repeatedly (or even once, when + # our own cancellation is already in progress) + while self._tasks: + if self._on_completed_fut is None: + self._on_completed_fut = self._loop.create_future() + + try: + await self._on_completed_fut + except exceptions.CancelledError as ex: + if not self._aborting: + # Our parent task is being cancelled: + # + # async def wrapper(): + # async with TaskGroup() as g: + # g.create_task(foo) + # + # "wrapper" is being cancelled while "foo" is + # still running. + propagate_cancellation_error = ex + self._abort() + + self._on_completed_fut = None + + assert not self._tasks + + if self._base_error is not None: + try: + raise self._base_error + finally: + exc = None + + # Propagate CancelledError if there is one, except if there + # are other errors -- those have priority. 
+ try: + if propagate_cancellation_error and not self._errors: + try: + raise propagate_cancellation_error + finally: + exc = None + finally: + propagate_cancellation_error = None + + if et is not None and et is not exceptions.CancelledError: + self._errors.append(exc) + + if self._errors: + try: + # TODO not supported on python 3.10 + # raise BaseExceptionGroup( + # 'unhandled errors in a TaskGroup', + # self._errors, + # ) from None + raise self._errors[0] + finally: + exc = None + + + def create_task(self, coro, *, name=None, context=None): + """Create a new task in this group and return it. + + Similar to `asyncio.create_task`. + """ + if not self._entered: + raise RuntimeError(f"TaskGroup {self!r} has not been entered") + if self._exiting and not self._tasks: + raise RuntimeError(f"TaskGroup {self!r} is finished") + if self._aborting: + raise RuntimeError(f"TaskGroup {self!r} is shutting down") + if context is None: + task = self._loop.create_task(coro) + else: + task = self._loop.create_task(coro, context=context) + tasks._set_task_name(task, name) + + # Always schedule the done callback even if the task is + # already done (e.g. if the coro was able to complete eagerly), + # otherwise if the task completes with an exception then it will cancel + # the current task too early. gh-128550, gh-128588 + + self._tasks.add(task) + task.add_done_callback(self._on_task_done) + try: + return task + finally: + # gh-128552: prevent a refcycle of + # task.exception().__traceback__->TaskGroup.create_task->task + del task + + # Since Python 3.8 Tasks propagate all exceptions correctly, + # except for KeyboardInterrupt and SystemExit which are + # still considered special. + + def _is_base_error(self, exc: BaseException) -> bool: + assert isinstance(exc, BaseException) + return isinstance(exc, (SystemExit, KeyboardInterrupt)) + + def _abort(self): + self._aborting = True + + for t in self._tasks: + if not t.done(): + t.cancel() + + def _on_task_done(self, task): + self._tasks.discard(task) + + if self._on_completed_fut is not None and not self._tasks: + if not self._on_completed_fut.done(): + self._on_completed_fut.set_result(True) + + if task.cancelled(): + return + + exc = task.exception() + if exc is None: + return + + self._errors.append(exc) + if self._is_base_error(exc) and self._base_error is None: + self._base_error = exc + + if self._parent_task.done(): + # Not sure if this case is possible, but we want to handle + # it anyways. + self._loop.call_exception_handler({ + 'message': f'Task {task!r} has errored out but its parent ' + f'task {self._parent_task} is already completed', + 'exception': exc, + 'task': task, + }) + return + + if not self._aborting and not self._parent_cancel_requested: + # If parent task *is not* being cancelled, it means that we want + # to manually cancel it to abort whatever is being run right now + # in the TaskGroup. But we want to mark parent task as + # "not cancelled" later in __aexit__. Example situation that + # we need to handle: + # + # async def foo(): + # try: + # async with TaskGroup() as g: + # g.create_task(crash_soon()) + # await something # <- this needs to be canceled + # # by the TaskGroup, e.g. + # # foo() needs to be cancelled + # except Exception: + # # Ignore any exceptions raised in the TaskGroup + # pass + # await something_else # this line has to be called + # # after TaskGroup is finished. 
+ self._abort() + self._parent_cancel_requested = True + self._parent_task.cancel() diff --git a/roll/utils/tracking.py b/roll/utils/tracking.py index dafda4156..785eca881 100644 --- a/roll/utils/tracking.py +++ b/roll/utils/tracking.py @@ -1,4 +1,5 @@ import json +from functools import wraps from typing import Optional, Dict, Any import torch @@ -10,6 +11,48 @@ tracker_registry: Dict[str, Any] = {} +def _strip_metric_tag(values: Dict[str, Any]) -> Dict[str, Any]: + """ + Strip reducer tags from metric keys before logging. + + We may annotate metric keys with reducer tags for internal aggregation: + "actor/pg_loss@sum", "actor/kl_loss@mean", ... + Dashboards (TensorBoard/W&B/...) should log clean names, so we remove "@...": + "actor/pg_loss@sum" -> "actor/pg_loss" + + - Only strips the last "@tag" part (rsplit("@", 1)) + - Recursively strips nested dict keys (e.g. add_scalars) + - Returns a new dict (does not mutate the input) + """ + def strip_key(k: str) -> str: + return k.rsplit("@", 1)[0] if isinstance(k, str) and "@" in k else k + + out: Dict[str, Any] = {} + for k, v in values.items(): + nk = strip_key(k) + if isinstance(v, dict): + v = _strip_metric_tag(v) + out[nk] = v + return out + + +def strip_at_tag_in_log(func): + """ + Decorator for Tracker.log(...). + + Purpose: + Remove "@tag" suffixes from metric keys right before sending them to the + logging backend. This is name-cleaning only (no reduction happens here). + """ + @wraps(func) + def wrapper(self, values: dict, step: Optional[int] = None, **kwargs): + if isinstance(values, dict): + values = _strip_metric_tag(values) + return func(self, values, step, **kwargs) + return wrapper + + + class BaseTracker: def log(self, values: dict, step: Optional[int], **kwargs): @@ -35,6 +78,7 @@ def __init__(self, config: dict, **kwargs): self.writer.add_hparams(hparam_dict=self.config, metric_dict={}) self.writer.flush() + @strip_at_tag_in_log def log(self, values: dict, step: Optional[int], **kwargs): for k, v in values.items(): if isinstance(v, (int, float)): @@ -68,6 +112,7 @@ def __init__(self, config: dict, **kwargs): self.run.config.update(config, allow_val_change=True) + @strip_at_tag_in_log def log(self, values: dict, step: Optional[int], **kwargs): self.run.log(values, step=step, **kwargs) @@ -92,6 +137,7 @@ def __init__(self, config: dict, **kwargs): self.run = swanlab.init(project=project, workspace=workspace, experiment_name=experiment_name, description=description, tags=tags, logdir=logdir, **kwargs) + @strip_at_tag_in_log def log(self, values: dict, step: Optional[int], **kwargs): self.run.log(values, step=step, **kwargs) @@ -104,6 +150,7 @@ class StdoutTracker(BaseTracker): def __init__(self, config: dict, **kwargs): self.config = config + @strip_at_tag_in_log def log(self, values: dict, step: Optional[int], **kwargs): logger.info(f"metrics_tag: {json.dumps({'step': step, 'metrics': values})}") diff --git a/roll/utils/train_infer_corrections.py b/roll/utils/train_infer_corrections.py new file mode 100644 index 000000000..99107df98 --- /dev/null +++ b/roll/utils/train_infer_corrections.py @@ -0,0 +1,255 @@ +from __future__ import annotations +from typing import Dict, Tuple, Optional + +import torch + +from roll.utils.functionals import masked_mean, masked_sum, agg_loss +from roll.pipeline.agentic.utils import compute_segment_masked_mean +from roll.configs.base_config import TrainInferCorrectionConfig +from roll.utils.logging import get_logger + +logger = get_logger() + + +def _compute_all_granularity(old_log_probs, 
infer_log_probs, response_mask) -> dict: + """Compute importance ratios and probability differences at multiple granularities.""" + response_mask = response_mask.long() + log_ratio = old_log_probs - infer_log_probs + + ratio_token = log_ratio.exp() + diff_token = old_log_probs.exp() - infer_log_probs.exp() + + # Geometric mean (per sequence, then broadcast to token level) + log_ratio_geo = masked_mean(log_ratio, response_mask, dim=-1) # [B] + ratio_geometric = log_ratio_geo.exp().unsqueeze(-1).expand_as(ratio_token) + diff_geometric = masked_mean(diff_token, response_mask, dim=-1).unsqueeze(-1).expand_as(diff_token) + + # Sequence-level sum (then broadcast to token level) + log_ratio_seq = masked_sum(log_ratio, response_mask, dim=-1) # [B] + ratio_sequence = log_ratio_seq.exp().unsqueeze(-1).expand_as(ratio_token) + diff_sequence = masked_sum(diff_token, response_mask, dim=-1).unsqueeze(-1).expand_as(diff_token) + + # Segment-level mean (computed per segment within each sequence) + log_ratio_segment = compute_segment_masked_mean(log_ratio, response_mask) # [B, T] + ratio_segment = log_ratio_segment.exp() + diff_segment = compute_segment_masked_mean(diff_token, response_mask) + + return { + "ratio": { + "token": ratio_token, + "geometric": ratio_geometric, + "sequence": ratio_sequence, + "segment": ratio_segment, + }, + "diff": { + "token": diff_token, + "geometric": diff_geometric, + "sequence": diff_sequence, + "segment": diff_segment, + }, + } + + +def _infer_global_valid_samples_from_mask(mask: torch.Tensor) -> float: + """Count the number of samples that contain at least one valid token.""" + valid_samples = (mask.sum(dim=-1) > 0).float().sum().detach().item() + return max(float(valid_samples), 1.0) + + +def _infer_global_valid_tokens_from_mask(mask: torch.Tensor) -> float: + """Count the total number of valid tokens across all samples.""" + valid_tokens = mask.float().sum().detach().item() + return max(float(valid_tokens), 1.0) + + +def compute_train_infer_correction( + cfg: TrainInferCorrectionConfig, + response_mask: torch.Tensor, # [B, T] + old_log_probs: torch.Tensor, # [B, T] + infer_log_probs: torch.Tensor, # [B, T] + global_valid_samples: Optional[int] = None, # Number of valid sequences + global_valid_tokens: Optional[int] = None, # Total number of valid tokens + apply_filters: bool = True, +) -> Tuple[torch.Tensor, torch.Tensor, Dict[str, float]]: + """Compute importance sampling weights and apply optional filters based on train-infer divergence.""" + metrics: Dict[str, float] = {} + + base_mask = response_mask.long() + if global_valid_samples is None: + global_valid_samples = _infer_global_valid_samples_from_mask(base_mask) + if global_valid_tokens is None: + global_valid_tokens = _infer_global_valid_tokens_from_mask(base_mask) + + stats = _compute_all_granularity( + old_log_probs=old_log_probs, + infer_log_probs=infer_log_probs, + response_mask=base_mask, + ) + ratio = stats["ratio"] + diff = stats["diff"] + + # 1) Importance Sampling (IS) Weight Handling + if cfg.is_weight.enabled: + is_weight = ratio[cfg.is_weight.weight_type] + ub = cfg.is_weight.upper_bound + if ub is not None: + # Log the fraction of weights clipped due to upper bound + metrics["actor/is_weight_clipfrac@sum"] = agg_loss( + loss_mat=(is_weight > ub).float(), + loss_mask=base_mask, + loss_agg_mode='token-mean', + batch_num_tokens=global_valid_tokens, + global_valid_samples=global_valid_samples + ).detach().item() + is_weight = is_weight.clamp(min=0.0, max=ub) + if cfg.is_weight.detach: + is_weight = 
is_weight.detach() + else: + is_weight = torch.ones_like(ratio["token"]).detach() + + # 2) Apply Filters (if enabled) + filter_mask = torch.ones_like(base_mask) + recorded_val_metrics = set() # Avoid duplicate metric logging for the same granularity + + if apply_filters: + for i, f in enumerate(cfg.filters): + if not f.enabled: + continue + + agg = f.agg_type + + # --- Ratio-based Filter --- + if f.ratio_enabled: + m_ratio = (ratio[agg] >= f.ratio_low).float() * (ratio[agg] <= f.ratio_high).float() + + # Log pass rate of this filter over currently active tokens + metrics[f"actor/train_infer_{agg}_ratio_mask_mean@sum"] = agg_loss( + loss_mat=m_ratio, + loss_mask=base_mask, + loss_agg_mode='token-mean', + batch_num_tokens=global_valid_tokens, + ).detach().item() + + # Log mean value of the ratio at this granularity (for monitoring) + val_key = f"actor/train_infer_ratio_{agg}_mean@sum" + if val_key not in recorded_val_metrics: + metrics[val_key] = agg_loss( + loss_mat=ratio[agg], + loss_mask=base_mask, + loss_agg_mode="seq-mean-token-mean", + global_valid_samples=global_valid_samples, + ).detach().item() + recorded_val_metrics.add(val_key) + + filter_mask = filter_mask * m_ratio + + # --- Difference-based Filter --- + if f.diff_enabled: + m_diff = (diff[agg] >= f.diff_low).float() * (diff[agg] <= f.diff_high).float() + + # Log pass rate of this filter + metrics[f"actor/train_infer_{agg}_diff_mask_mean"] = agg_loss( + loss_mat=m_diff, + loss_mask=base_mask, + loss_agg_mode='token-mean', + batch_num_tokens=global_valid_tokens, + ).detach().item() + + # Log mean value of the difference at this granularity + val_key = f"actor/train_infer_diff_{agg}_mean@sum" + if val_key not in recorded_val_metrics: + metrics[val_key] = agg_loss( + loss_mat=diff[agg], + loss_mask=base_mask, + loss_agg_mode="seq-mean-token-mean", + global_valid_samples=global_valid_samples, + ).detach().item() + recorded_val_metrics.add(val_key) + + filter_mask = filter_mask * m_diff + + # 3) Final overall pass rate after all filters + if apply_filters and cfg.filters: + metrics["actor/train_infer_final_mask_mean"] = masked_mean( + base_mask*filter_mask.float(), base_mask + ).detach().item() + + return is_weight, filter_mask, metrics + + +def apply_train_infer_correction_to_batch( + pipeline_config, + batch, + stat_mask_key='response_mask', + update_mask_keys: Optional[list] = None, +): + """Apply train-infer correction to a batch at the pipeline level. + + This function is designed for pipeline-level usage where masks are in their + original shape [B, T]. It handles slicing internally and updates the original + masks with the computed filter mask. + + Args: + pipeline_config: Pipeline configuration containing train_infer_correction config + batch: DataProto batch to modify + stat_mask_key: Key of mask used for computing train-infer statistics (diff, ratio) + update_mask_keys: List of mask keys to update with computed filter mask. + If None, defaults to ['response_mask']. + + Note: + For worker-level usage, use compute_train_infer_correction() directly, + as it works with already-sliced tensors [B, T-1] and provides more flexibility. 
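+
+    Example (illustrative sketch only, not taken from a specific pipeline; it
+    assumes `pipeline_config.train_infer_correction` is populated and that
+    `batch` already carries "old_log_probs" and "infer_logprobs" of shape [B, T-1]):
+
+        batch, corr_metrics = apply_train_infer_correction_to_batch(
+            pipeline_config=pipeline_config,
+            batch=batch,
+            stat_mask_key="response_mask",
+            update_mask_keys=["response_mask"],
+        )
+        # batch.batch["train_infer_is_weight"] now holds the IS weights and the
+        # listed masks have been multiplied by the computed filter mask.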
+ """ + # Check if required fields are present + if "old_log_probs" not in batch.batch or "infer_logprobs" not in batch.batch: + missing_fields = [] + if "old_log_probs" not in batch.batch: + missing_fields.append("'old_log_probs'") + if "infer_logprobs" not in batch.batch: + missing_fields.append("'infer_logprobs'") + logger.warning(f"Skipping train-infer correction: {', '.join(missing_fields)} not found in batch.") + stat_mask = batch.batch[stat_mask_key][:, 1:].long() # [B, T-1] + batch.batch["train_infer_is_weight"] = torch.ones_like(stat_mask, dtype=torch.float32) + return batch, {} + + # Default: update response_mask if not specified + if update_mask_keys is None: + update_mask_keys = [stat_mask_key] + + # Get the mask for computing train-infer statistics (always sliced to [B, T-1]) + stat_mask = batch.batch[stat_mask_key][:, 1:].long() # [B, T-1] + old_lp = batch.batch["old_log_probs"] # [B, T-1] + infer_lp = batch.batch["infer_logprobs"] # [B, T-1] + + cfg = pipeline_config.train_infer_correction + + # Compute IS weights and filter mask + is_w, filter_mask, corr_metrics = compute_train_infer_correction( + cfg=cfg, + response_mask=stat_mask, + old_log_probs=old_lp, + infer_log_probs=infer_lp, + global_valid_samples=None, # Will be inferred from stat_mask + global_valid_tokens=None, # Will be inferred from stat_mask + apply_filters=True, + ) + + # Set train_infer_is_weight + batch.batch["train_infer_is_weight"] = is_w + + # Apply filter mask to all specified masks + for key in update_mask_keys: + if key in batch.batch: + mask_tensor = batch.batch[key] + # Check if mask is already sliced (shape [B, T-1]) or full (shape [B, T]) + # final_response_mask is already [:, 1:] sliced in get_sample_level_mask + if mask_tensor.shape[-1] == filter_mask.shape[-1]: + # Mask is already sliced (e.g., final_response_mask) + batch.batch[key] = mask_tensor.long() * filter_mask.long() + else: + # Mask is full shape (e.g., response_mask), apply to [:, 1:] part + batch.batch[key][:, 1:] = mask_tensor[:, 1:].long() * filter_mask.long() + else: + logger.warning(f"Mask key '{key}' not found in batch, skipping update.") + + return batch, corr_metrics diff --git a/tests/agentic/env/test_mcp_client.py b/tests/agentic/env/test_mcp_client.py new file mode 100644 index 000000000..3280e21dc --- /dev/null +++ b/tests/agentic/env/test_mcp_client.py @@ -0,0 +1,94 @@ +import pytest +import json +from roll.pipeline.agentic.env.mcp.mcp_client import MCPClient + +@pytest.mark.asyncio +async def test_sokoban_mcp_server_interaction(): + async with MCPClient("http://sokoban-mcp.alibaba-inc.com/sse") as client: + tools_list = await client.tools() + tool_names = [tool.name for tool in tools_list] + assert "reset" in tool_names, "reset tool not found in server tools" + assert "play" in tool_names, "play tool not found in server tools" + # call reset without seed + raw_reset_result = await client.call_tool("reset") + reset_result = parse_call_tool_result(raw_reset_result) + assert "Observation" in reset_result + print("Reset observation:\n", reset_result["Observation"]) + # call reset with seed=2 + seed = 2 + raw_reset_seed_result = await client.call_tool("reset", {"seed": seed}) + reset_seed_result = parse_call_tool_result(raw_reset_seed_result) + assert "Observation" in reset_seed_result + assert reset_seed_result["Observation"] == "######\n#_#_P#\n#_#X_#\n#___O#\n#____#\n######" + print(f"Reset with seed={seed} observation:\n", reset_seed_result["Observation"]) + + # call play with action=3 (left) + await 
call_play_and_parse(client, 3, + expected_obs="######\n#_#P_#\n#_#X_#\n#___O#\n#____#\n######" + ) + # call play with action=2 (down) + await call_play_and_parse(client, 2, + expected_obs="######\n#_#__#\n#_#P_#\n#__XO#\n#____#\n######" + ) + # call play with action=4 (right) + await call_play_and_parse(client, 4, + expected_obs="######\n#_#__#\n#_#_P#\n#__XO#\n#____#\n######" + ) + # call play with action=2 (down) + await call_play_and_parse(client, 2, + expected_obs="######\n#_#__#\n#_#__#\n#__XS#\n#____#\n######" + ) + # call play with action=2 (down) + await call_play_and_parse(client, 2, + expected_obs="######\n#_#__#\n#_#__#\n#__XO#\n#___P#\n######" + ) + # call play with action=3 (left) + await call_play_and_parse(client, 3, + expected_obs="######\n#_#__#\n#_#__#\n#__XO#\n#__P_#\n######" + ) + # call play with action=3 (left) + await call_play_and_parse(client, 3, + expected_obs="######\n#_#__#\n#_#__#\n#__XO#\n#_P__#\n######" + ) + # call play with action=1 (up) + await call_play_and_parse(client, 1, + expected_obs="######\n#_#__#\n#_#__#\n#_PXO#\n#____#\n######" + ) + # call play with action=4 (right) + await call_play_and_parse(client, 4, + expected_obs="######\n#_#__#\n#_#__#\n#__P√#\n#____#\n######", + reward=10.9, + done=True, + success=True + ) + +def parse_call_tool_result(call_tool_result): + """ + Extract the JSON string from CallToolResult + """ + content_list = getattr(call_tool_result, "content", []) + text_json_str = None + for content_item in content_list: + if hasattr(content_item, "type") and content_item.type == "text": + text_json_str = content_item.text + break + if not text_json_str: + raise ValueError("No 'text' content found in CallToolResult") + return json.loads(text_json_str) + +async def call_play_and_parse(client, action_code, expected_obs, reward=-0.1, done=False, success=False, effective=True): + raw = await client.call_tool("play", {"action": action_code}) + res = parse_call_tool_result(raw) + assert res["Observation"] == expected_obs + assert res["Reward"] == reward + assert res.get("Game End") is done + server_info = res.get("info", {}) + assert server_info.get("action_is_effective") is effective + assert server_info.get("success") is success + print(f"Action {action_code} Observation:\n{res['Observation']}") + print(f"Game ended: {res['Game End']} \ninfo: {res['info']}") + return res + +if __name__ == "__main__": + import asyncio + asyncio.run(test_sokoban_mcp_server_interaction()) diff --git a/tests/agentic/env/test_sokoban_mcp.py b/tests/agentic/env/test_sokoban_mcp.py new file mode 100644 index 000000000..f1fe54fde --- /dev/null +++ b/tests/agentic/env/test_sokoban_mcp.py @@ -0,0 +1,159 @@ +import pytest +from unittest.mock import MagicMock +from roll.pipeline.agentic.env.mcp.mcp_client import MCPClient +from roll.pipeline.agentic.env.mcp import SokobanMCPEnv + +# Configuration +SERVER_URL = "http://sokoban-mcp.alibaba-inc.com/sse" +MOCK_SERVER_URL = "http://mock-sokoban-server.test" +TEST_SEED = 2 +TEST_ACTION_STR = "Left" + +MOCK_ENV_INSTRUCTION = "Solve the puzzle." +MOCK_ACTION_LOOKUP = {1: "Up", 2: "Down", 3: "Left", 4: "Right"} +MOCK_FORMAT_PENALTY = -0.15 + +# ============================================================================= +# / Pytest Fixtures / +# ============================================================================= +@pytest.fixture(scope="function") +def real_sokoban_env(): + """ + Provides a SokobanMCPEnv instance connected to the REAL server. + Use this fixture ONLY for integration tests. 
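+    Note (added for clarity): this fixture connects to the live MCP endpoint at
+    SERVER_URL, so the integration tests that use it need network access to that
+    host and will fail if the server is unreachable.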
+ """ + print("\n[Fixture Setup] Creating SokobanMCPEnv instance for integration test...") + + env = SokobanMCPEnv( + server_url=SERVER_URL, + env_instruction=MOCK_ENV_INSTRUCTION, + action_lookup=MOCK_ACTION_LOOKUP, + format_penalty=MOCK_FORMAT_PENALTY, + ) + yield env + +@pytest.fixture +def isolated_mock_env(): + """ + Provides a mocked env where the automatic __init__ async logic is disabled, + allowing for isolated testing of individual methods like step() and reset(). + """ + env = SokobanMCPEnv( + server_url=MOCK_SERVER_URL, + env_instruction=MOCK_ENV_INSTRUCTION, + action_lookup=MOCK_ACTION_LOOKUP, + format_penalty=MOCK_FORMAT_PENALTY, + client=MagicMock(spec_set=MCPClient), + ) + env._last_obs = "A previous observation state." + yield env + +# ============================================================================= +# / Integration Tests (Requires Real Server) / +# ============================================================================= +def test_sokoban_mcp_env_with_valid_action(real_sokoban_env: SokobanMCPEnv): + """Integration test for SokobanMCPEnv with real server connection""" + # 1. Test environment reset + obs, info = real_sokoban_env.reset(seed=TEST_SEED) + print(f"Initial state (seed={TEST_SEED}):\n{obs}") + + # Validate initial state + assert "Solve the puzzle" in obs, "Observation should contain the instruction string." + assert "######\n#_#_P#" in info['suffix'], "Initial state in 'suffix' mismatch" + + # 2. Test single action execution + llm_output_action = f"{TEST_ACTION_STR}" + + # ACT: Pass the full, tagged string to the step function. + obs, reward, terminated, truncated, info = real_sokoban_env.step(llm_output_action) + print(f"After action {TEST_ACTION_STR}:\n{obs}") + print(f"Reward: {reward}, Terminated: {terminated}, Success: {info.get('success', False)}") + + # Validate post-action state + assert f"you moved {TEST_ACTION_STR}" in obs, "Feedback text should confirm the executed action." + assert "######\n#_#P_#" in info['suffix'], "Post-action state in 'suffix' mismatch" + assert reward == -0.1, "Reward value mismatch" + assert not terminated, "Game should not be terminated after one action" + assert not truncated, "Game should not be truncated after one action" + assert not info['metrics']["success"], "Game should not be successful after one action" + +# ============================================================================= +# / Unit Tests - Environment Interaction (`reset`, `step`) / +# ============================================================================= +def test_reset_wraps_connection_error_in_runtime_error(isolated_mock_env: SokobanMCPEnv): + """ + Tests that reset raises an error if the connection fails during its execution. + """ + # ARRANGE + env = isolated_mock_env + + # This mock is for the call inside reset's _run_async_logic + env._run_async_logic = MagicMock(side_effect=ConnectionError("Server is down!")) + # We expect reset() to catch ConnectionError and raise RuntimeError + with pytest.raises(RuntimeError, match="Failed to reset the environment due to a server or network issue"): + env.reset(seed=TEST_SEED) + +def test_step_handles_invalid_action(isolated_mock_env: SokobanMCPEnv): + """ + Tests that the step() method's first error handling block correctly catches + ANY ValueError raised by the parse_action method and calls the error handler. 
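+    (Note added for clarity: in the arrangement below, parse_action is mocked to
+    return an unparsable action, i.e. {"action": None, ...}, which drives step()
+    down the same invalid-action handling path as a parse failure.)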
+ """ + # ARRANGE + env = isolated_mock_env + + env.parse_action = MagicMock(return_value={"action": None, "action_content": "Go Up"}) + + obs, reward, terminated, truncated, info = env.step("Go Up") + + # Check the final output to confirm the error handling flow completed. + assert "provided an invalid action" in obs + assert reward == MOCK_FORMAT_PENALTY, "Reward should be the format penalty" + assert not terminated + assert not truncated + assert info["metrics"]["action_is_valid"] is False + assert info["metrics"]["action_is_effective"] is False + assert info["metrics"]["success"] is False + assert "The game state has not changed. Please provide a valid action in the correct format." in info["suffix"], "Suffix should contain the old state" + +# ============================================================================ +# / Unit Tests - Pure Functions and Parsers / +# ============================================================================= +def test_parse_action_simple_logic(isolated_mock_env: SokobanMCPEnv): + """Tests the generic parse_action method from the MCPEnv base class.""" + env = isolated_mock_env + # --- Path 1: SUCCESS (Valid action) --- + action_info = env.parse_action("Up") + assert action_info["action"] == 1 + assert action_info["action_content"] == "Up" + + # === BASIC FORMATTING FAILURES === + + # --- Path 2: FAILURE (No tags) --- + action_info = env.parse_action("Up") + assert action_info["action"] is None + + # --- Path 3: FAILURE (Content is not valid) --- + action_info = env.parse_action("move left") + assert action_info["action"] is None + +def test_process_parsed_json_logic(isolated_mock_env: SokobanMCPEnv): + """ + Unit test for the game-specific process_parsed_json method. + """ + isolated_mock_env._last_obs = "Previous state" + + success_response = { + "Observation": "New state", + "Reward": 1.0, + "Game End": True, + "info": {"success": True, "action_is_effective": True} + } + obs, terminated, truncated, info = isolated_mock_env._process_parsed_json(success_response) + + assert "New state" in obs + assert terminated + assert not truncated + assert info["metrics"]["success"] + assert info["metrics"]["action_is_effective"] + assert info["metrics"]["format_penalty"] == 0.0 + assert info["reward_from_server"] == 1.0 \ No newline at end of file diff --git a/tests/agentic/env/test_sokoban_sandbox.py b/tests/agentic/env/test_sokoban_sandbox.py new file mode 100644 index 000000000..c60c81f63 --- /dev/null +++ b/tests/agentic/env/test_sokoban_sandbox.py @@ -0,0 +1,88 @@ +from roll.pipeline.agentic.env.sandbox import SokobanSandboxEnv +import traceback + +def test_sandbox(): + """ + Main function to run an interactive test session with the SokobanSandboxEnv. + """ + try: + env = SokobanSandboxEnv() + print("--- Initialization Successful! ---") + + # Initial reset to start the first game + obs, info = env.reset(seed=1) + print_game_state(obs, info) + + while True: + keyboard = input("Enter action(up, down, left, right), render, reset, or enter exit to quit): ").strip().lower() + + if not keyboard: + continue + + if keyboard == "exit": + break + + if keyboard == "render": + print(env.render()) + continue + + if keyboard.startswith("reset"): + parts = keyboard.split() + seed = None + if len(parts) > 1: + try: + seed = int(parts[1]) + print(f"--- Resetting with seed: {seed} ---") + except (ValueError, IndexError): + print("Invalid seed provided. 
Resetting with a random seed.") + + obs, info = env.reset(seed=seed) + print_game_state(obs, info) + continue + + # Wrap the action in the format expected by the LLM parser + action = f"{keyboard}" + obs, reward, terminated, truncated, info = env.step(action) + print_game_state(obs, info) + print(f"Reward: {reward:.2f}, Terminated: {terminated}, Truncated: {truncated}") + + if terminated or truncated: + print("\n!!! GAME OVER !!!Starting a new game...") + obs, info = env.reset() + print_game_state(obs, info) + + except Exception as e: + print("\n!!! An error occurred during SokobanSandboxEnv initialization !!!") + # traceback.format_exc() is more informative than just printing the exception 'e' + print("--- Full Traceback ---") + print(traceback.format_exc()) + print("--- End of Traceback ---") + + finally: + if env: + print("\n--- Closing environment ---") + env.close() + +def print_game_state(obs, info): + """ + A helper function to neatly print the current game state. + + Args: + obs (str): The observation string, which contains rules or turn feedback. + info (dict): The info dictionary, which should contain the game map. + """ + print("\n" + "="*20 + " CURRENT STATE " + "="*20) + + # Print the observation (game rules or turn feedback) + print("\n[Observation]") + print(obs) + + # Extract and print the game map from the info dictionary + game_map = info.get('suffix', 'No map data found in info.') + print("\n[Map]") + print(game_map.strip()) # .strip() removes potential leading/trailing whitespace + + print("="*55 + "\n") + +if __name__ == "__main__": + test_sandbox() \ No newline at end of file diff --git a/tests/agentic/env_manager/test_traj_env_manager_debug.py b/tests/agentic/env_manager/test_traj_env_manager_debug.py new file mode 100644 index 000000000..5d328c85f --- /dev/null +++ b/tests/agentic/env_manager/test_traj_env_manager_debug.py @@ -0,0 +1,152 @@ +""" +usage: + +conda create -n python310_torch260_em python=3.10 + +pip3 install torch torchvision torchaudio py-cpuinfo +pip install -r requirements_em_local_debug.txt + +python tests/agentic/env_manager/test_traj_env_manager.py +""" +import threading + +import ray + +from roll.distributed.scheduler.rollout_scheduler import GroupQueueManager +from roll.distributed.scheduler.protocol import DataProto +from roll.models.model_providers import default_tokenizer_provider, default_processor_provider, get_extra_data_provider +from roll.pipeline.agentic.agentic_config import AgenticConfig +from roll.pipeline.agentic.env_manager.step_env_manager import StepEnvManager +from roll.pipeline.agentic.env_manager.traj_env_manager import TrajEnvManager +from roll.pipeline.agentic.env_manager.vl_traj_env_manager import VLTrajEnvManager +from roll.utils.import_utils import safe_import_class +from tests.agentic.env_manager.config_load_utils import make_pipeline_config + + +def test_debug_traj_env_manager(): + ray.init(log_to_driver=True) + current_step = 0 + + config_path = "" + config_name = "traj_env_manager_debug" + + pipeline_config: AgenticConfig = make_pipeline_config(config_path, config_name, AgenticConfig) + + pipeline_config.model_download_type = "MODELSCOPE" + pipeline_config.async_generation_ratio = 2 + + worker_config = pipeline_config.train_env_manager + tokenizer = default_tokenizer_provider(model_args=worker_config.model_args) + generate_scheduler = None + + output_queue = GroupQueueManager.remote(config=pipeline_config, env_manager_config=worker_config, mode="train") + + ray.get(output_queue.advance_step.remote(current_step)) + + 
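+    # (Descriptive comment added for clarity.) This debug test builds the env
+    # manager for the first configured environment and runs its rollout loop in
+    # a background thread; generate_scheduler stays None here, and finished
+    # trajectories are pulled back through the shared GroupQueueManager below.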
env_config = worker_config.env_configs[0][0] + env_manager_cls = safe_import_class(env_config["env_manager_cls"]) + env_manager = env_manager_cls(worker_config=worker_config, + pipeline_config=pipeline_config, + env_config=worker_config.env_configs[0][0], + tokenizer=tokenizer, + generate_scheduler=generate_scheduler, + output_queue=output_queue, + thread_lock=threading.Lock(), + mode="train") + env_manager.update_step(global_step=current_step) + + data = DataProto(meta_info={"seed": 0}) + thread = threading.Thread(target=env_manager.run_rollout_loop, args=(data,), daemon=False) + thread.start() + + batch = ray.get(output_queue.get_batch.remote(batch_size=pipeline_config.rollout_batch_size, current_step=current_step)) + print(batch) + print(f"batch_size: {len(batch)}") + env_manager.stop() + + +def test_debug_vl_traj_env_manager(): + ray.init(log_to_driver=True) + current_step = 0 + + config_path = "" + config_name = "vl_traj_env_manager_debug" + + pipeline_config: AgenticConfig = make_pipeline_config(config_path, config_name, AgenticConfig) + pipeline_config.model_download_type = "MODELSCOPE" + pipeline_config.async_generation_ratio = 2 + worker_config = pipeline_config.train_env_manager + tokenizer = default_tokenizer_provider(model_args=worker_config.model_args) + processor = default_processor_provider(model_args=worker_config.model_args) + extra_data_provider = get_extra_data_provider(worker_config.model_args.model_name_or_path, processor=processor) + generate_scheduler = None + + output_queue = GroupQueueManager.remote(config=pipeline_config, env_manager_config=worker_config, mode="train") + + ray.get(output_queue.advance_step.remote(current_step)) + env_manager = VLTrajEnvManager(worker_config=worker_config, + pipeline_config=pipeline_config, + env_config=worker_config.env_configs[0][0], + tokenizer=tokenizer, + processor=processor, + generate_scheduler=generate_scheduler, + output_queue=output_queue, + thread_lock=threading.Lock(), + extra_data_provider=extra_data_provider, + mode="train") + env_manager.update_step(global_step=current_step) + + data = DataProto(meta_info={"seed": 0}) + thread = threading.Thread(target=env_manager.run_rollout_loop, args=(data,)) + thread.start() + + print("pipeline_config.rollout_batch_size: ", pipeline_config.rollout_batch_size) + batch = ray.get(output_queue.get_batch.remote(batch_size=pipeline_config.rollout_batch_size, current_step=0)) + # print(batch) + print(f"batch_size: {len(batch)}") + env_manager.stop() + + +def test_debug_step_env_manager(): + ray.init(log_to_driver=True) + current_step = 0 + + config_path = "" + config_name = "step_env_manager_debug" + + pipeline_config: AgenticConfig = make_pipeline_config(config_path, config_name, AgenticConfig) + + pipeline_config.model_download_type = "MODELSCOPE" + pipeline_config.async_generation_ratio = 2 + + worker_config = pipeline_config.train_env_manager + tokenizer = default_tokenizer_provider(model_args=worker_config.model_args) + generate_scheduler = None + + output_queue = GroupQueueManager.remote(config=pipeline_config, env_manager_config=worker_config, mode="train") + + ray.get(output_queue.advance_step.remote(current_step)) + env_manager = StepEnvManager(worker_config=worker_config, + pipeline_config=pipeline_config, + env_config=worker_config.env_configs[0][0], + tokenizer=tokenizer, + generate_scheduler=generate_scheduler, + output_queue=output_queue, + thread_lock=threading.Lock(), + mode="train") + env_manager.update_step(global_step=current_step) + + data = 
DataProto(meta_info={"seed": 0}) + thread = threading.Thread(target=env_manager.run_rollout_loop, args=(data,)) + thread.start() + + batch = ray.get(output_queue.get_batch.remote(batch_size=pipeline_config.rollout_batch_size, current_step=current_step)) + # print(batch) + print(f"batch_size: {len(batch)}") + env_manager.stop() + + +if __name__ == '__main__': + test_debug_traj_env_manager() + # test_debug_vl_traj_env_manager() + # test_debug_step_env_manager() \ No newline at end of file diff --git a/tests/agentic/env_manager/traj_env_manager_debug.yaml b/tests/agentic/env_manager/traj_env_manager_debug.yaml index 59b5777e1..f893019ea 100644 --- a/tests/agentic/env_manager/traj_env_manager_debug.yaml +++ b/tests/agentic/env_manager/traj_env_manager_debug.yaml @@ -39,6 +39,12 @@ custom_envs: ${custom_env.deep_math} CodeContest: ${custom_env.CodeContest} + SweNativeEnv: + ${custom_env.SweNativeEnv} + SokobanNativeEnv: + ${custom_env.SokobanNativeEnv} + RockTBNativeEnvTrain: + ${custom_env.RockTBNativeEnvTrain} actor_infer: generating_args: @@ -145,7 +151,100 @@ custom_env: timeout: 5 sandbox_type: none keep_error_last_line: false + SweNativeEnv: + env_type: "swe_native_env" + max_steps: 10 + max_tokens_per_step: 8192 + env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + dataset_name: data/swe/data.jsonl + train_idx_range: [1, 1] + val_idx_range: [0, 0] + tools: [ "Edit", "glob", "list_directory", "read_file", "Shell" ] + traj_dir: "./traj/trainset/" + swe_requirement_dir: "" + base_dir: "./logs" + + max_steps: 5 + mode: "train" + xrl_authorization: t-r8c4rjh0por8gwc5 + sandbox_base_url: https://xrl-sandbox.alibaba-inc.com + user_id: '410435' + experiment_id: "test" + SokobanNativeEnv: + env_type: "sokoban_native_env" + max_steps: 10 + max_tokens_per_step: 128 + env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + max_steps: 10 + + RockTBNativeEnvTrain: + env_type: "rock_tb_native_env" + max_steps: 10 + max_tokens_per_step: 8192 + env_manager_cls: roll.pipeline.agentic.env_manager.agent_native_env_manager.AgentNativeStepEnvManager + agent_system_template: "agent_system_template placeholder" + agent_template: "agent_template placeholder" + env_config: + dataset_name: data/terminal_bench.jsonl + train_idx_range: [13, 13] + tools: ~ + max_steps: 10 + mode: "train" + xrl_authorization: t-j99eljub5bst4q9p + sandbox_base_url: https://xrl-sandbox.alibaba-inc.com + user_id: '481404' + experiment_id: "test" + test_files: ["output/terminal-bench/terminal-bench"] + agent_config: + agent_type: "iflow-cli" + version: "0.0.1" + pre_init_bash_cmd_list: + - command: 'echo "118.31.38.66 github.com" | tee -a /etc/hosts' + timeout_seconds: 30 + - command: "apt-get update" + timeout_seconds: 600 + - command: "apt-get install -y curl git wget xz-utils" + timeout_seconds: 600 + - command: "apt-get install -y build-essential libc6-dev patch procps" + timeout_seconds: 600 + model_service_config: + model_service_install_cmd: 'pip install "rl_rock[model-service]==1.0.0" -i https://artlab.alibaba-inc.com/1/pypi/simple --trusted-host artlab.alibaba-inc.com' + iflow_cli_install_cmd: "wget --retry-connrefused --tries=10 --waitretry=2 -O ~/iflow-cli.tgz 
'http://cloud.iflow.cn/iflow-cli/iflow-ai-iflow-cli-for-roll-0-4-4-v5.tgz' && npm i -g ~/iflow-cli.tgz && ln -s /opt/nodejs/bin/iflow /usr/local/bin/iflow" + iflow_settings: + selectedAuthType: "openai-compatible" + apiKey: "123" + baseUrl: "http://127.0.0.1:8080/v1/" + modelName: "Qwen3-Coder-Plus" + searchApiKey: "88888888" + disableAutoUpdate: True + shellTimeout: 360000 + tokensLimit: 128000 + coreTools: [ + "Edit", + "exit_plan_mode", + "glob", + "list_directory", + "multi_edit", + "plan", + "read plan", + "read_file", + "read_many_files", + "save_memory", + "Search", + "Shell", + "task", + "web_fetch", + "web_search", + "write_file", + "xml_escape", + ] cli_agent_system_template: You're a helpful assistant. You are a good game player. You are aiming to get high reward in the game. cli_agent_template: |
diff --git a/tests/agentic/test_segment_masked_mean.py b/tests/agentic/test_segment_masked_mean.py
new file mode 100644
index 000000000..0b66b9169
--- /dev/null
+++ b/tests/agentic/test_segment_masked_mean.py
@@ -0,0 +1,197 @@
+import torch
+import pytest
+from roll.pipeline.agentic.agentic_actor_worker import compute_segment_masked_mean
+
+
+def test_single_segment():
+    """Test a single contiguous run of 1s."""
+    # mask: [0, 0, 1, 1, 1, 0, 0]
+    # tensor: [0, 0, 2, 4, 6, 0, 0]
+    # Expected: the mean over positions 2-4 is (2+4+6)/3 = 4.0
+    mask = torch.tensor([[0, 0, 1, 1, 1, 0, 0]], dtype=torch.long)
+    tensor = torch.tensor([[0, 0, 2, 4, 6, 0, 0]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[0, 0, 4.0, 4.0, 4.0, 0, 0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_multiple_segments():
+    """Test multiple contiguous runs of 1s separated by 0s."""
+    # mask: [0, 1, 1, 0, 1, 1, 1, 0]
+    # tensor: [0, 1, 2, 0, 3, 4, 5, 0]
+    # First segment (positions 1-2): mean = (1+2)/2 = 1.5
+    # Second segment (positions 4-6): mean = (3+4+5)/3 = 4.0
+    mask = torch.tensor([[0, 1, 1, 0, 1, 1, 1, 0]], dtype=torch.long)
+    tensor = torch.tensor([[0, 1, 2, 0, 3, 4, 5, 0]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[0, 1.5, 1.5, 0, 4.0, 4.0, 4.0, 0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_starts_with_one():
+    """Test the case where the mask starts with 1."""
+    # mask: [1, 1, 0, 1, 0]
+    # tensor: [2, 4, 0, 6, 0]
+    # First segment (positions 0-1): mean = (2+4)/2 = 3.0
+    # Second segment (position 3): mean = 6.0
+    mask = torch.tensor([[1, 1, 0, 1, 0]], dtype=torch.long)
+    tensor = torch.tensor([[2, 4, 0, 6, 0]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[3.0, 3.0, 0, 6.0, 0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_ends_with_one():
+    """Test the case where the mask ends with 1."""
+    # mask: [0, 1, 1, 1]
+    # tensor: [0, 2, 4, 6]
+    # Expected: the mean over positions 1-3 is (2+4+6)/3 = 4.0
+    mask = torch.tensor([[0, 1, 1, 1]], dtype=torch.long)
+    tensor = torch.tensor([[0, 2, 4, 6]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[0, 4.0, 4.0, 4.0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_all_ones():
+    """Test the all-ones case."""
+    # mask: [1, 1, 1]
+    # tensor: [1, 2, 3]
+    # Expected: every position gets the mean (1+2+3)/3 = 2.0
+    mask = torch.tensor([[1, 1, 1]], dtype=torch.long)
+    tensor = torch.tensor([[1, 2, 3]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[2.0, 2.0, 2.0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_all_zeros():
+    """Test the all-zeros case."""
+    # mask: [0, 0, 0]
+    # tensor: [1, 2, 3]
+    # Expected: all positions are 0
+    mask = torch.tensor([[0, 0, 0]], dtype=torch.long)
+    tensor = torch.tensor([[1, 2, 3]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[0, 0, 0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_single_one():
+    """Test the case of a single 1."""
+    # mask: [0, 0, 1, 0, 0]
+    # tensor: [0, 0, 5, 0, 0]
+    # Expected: position 2 keeps the value 5.0
+    mask = torch.tensor([[0, 0, 1, 0, 0]], dtype=torch.long)
+    tensor = torch.tensor([[0, 0, 5, 0, 0]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[0, 0, 5.0, 0, 0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_complex_pattern():
+    """Test a complex pattern: multiple segments, starting and ending with 1."""
+    # mask: [1, 1, 0, 0, 1, 1, 1, 0, 1]
+    # tensor: [1, 2, 0, 0, 3, 4, 5, 0, 6]
+    # First segment (positions 0-1): mean = (1+2)/2 = 1.5
+    # Second segment (positions 4-6): mean = (3+4+5)/3 = 4.0
+    # Third segment (position 8): mean = 6.0
+    mask = torch.tensor([[1, 1, 0, 0, 1, 1, 1, 0, 1]], dtype=torch.long)
+    tensor = torch.tensor([[1, 2, 0, 0, 3, 4, 5, 0, 6]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([[1.5, 1.5, 0, 0, 4.0, 4.0, 4.0, 0, 6.0]], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_batch_processing():
+    """Test batch processing."""
+    # batch_size=2
+    # Sample 1: mask=[0,1,1,0], tensor=[0,2,4,0] -> mean=3.0
+    # Sample 2: mask=[1,1,0,1], tensor=[1,3,0,5] -> first segment mean=2.0, second segment mean=5.0
+    mask = torch.tensor([
+        [0, 1, 1, 0],
+        [1, 1, 0, 1]
+    ], dtype=torch.long)
+    tensor = torch.tensor([
+        [0, 2, 4, 0],
+        [1, 3, 0, 5]
+    ], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    expected = torch.tensor([
+        [0, 3.0, 3.0, 0],
+        [2.0, 2.0, 0, 5.0]
+    ], dtype=torch.float32)
+    torch.testing.assert_close(result, expected)
+
+
+def test_segments_not_multiplied():
+    """Test that different segments are not multiplied together (verify independence)."""
+    # mask: [1, 1, 0, 1, 1]
+    # tensor: [1, 1, 0, 10, 10]
+    # First segment (positions 0-1): mean = (1+1)/2 = 1.0
+    # Second segment (positions 3-4): mean = (10+10)/2 = 10.0
+    # If the segments were multiplied, the result would be 10.0; each segment must stay independent.
+    mask = torch.tensor([[1, 1, 0, 1, 1]], dtype=torch.long)
+    tensor = torch.tensor([[1, 1, 0, 10, 10]], dtype=torch.float32)
+
+    result = compute_segment_masked_mean(tensor, mask)
+
+    # Verify the first segment is 1.0 and the second is 10.0, not multiplied together
+    assert result[0, 0].item() == pytest.approx(1.0)
+    assert result[0, 1].item() == pytest.approx(1.0)
+    assert result[0, 3].item() == pytest.approx(10.0)
+    assert result[0, 4].item() == pytest.approx(10.0)
+
+
+if __name__ == "__main__":
+    # Run all tests
+    test_single_segment()
+    print("test_single_segment passed")
+
+    test_multiple_segments()
+    print("test_multiple_segments passed")
+
+    test_starts_with_one()
+    print("test_starts_with_one passed")
+
+    test_ends_with_one()
+    print("test_ends_with_one passed")
+
+    test_all_ones()
+    print("test_all_ones passed")
+
+    test_all_zeros()
+    print("test_all_zeros passed")
+
+    test_single_one()
+    print("test_single_one passed")
+
+    test_complex_pattern()
+    print("test_complex_pattern passed")
+
+    test_batch_processing()
+    print("test_batch_processing passed")
+
+    test_segments_not_multiplied()
+    print("test_segments_not_multiplied passed")
+
+    print("\nAll tests passed!")
+
diff --git a/tests/datasets/test_collator.py b/tests/datasets/test_collator.py
index 38113a765..8221ccdbd 100644
--- a/tests/datasets/test_collator.py
+++ b/tests/datasets/test_collator.py
@@ -1,93 +1,49 @@
-import numpy as np
 import torch
+from transformers import PreTrainedTokenizerFast, AutoTokenizer
-from roll.datasets.collator import 
DataCollatorWithPaddingForPaddedKeys, collate_fn_to_dict_list +from roll.datasets.collator import DataCollatorWithPaddingForPaddedKeys -class DummyTokenizer: - """Minimal tokenizer stub that behaves like a HF tokenizer for padding.""" +def test_data_collator_with_padding_for_padded_keys(): + tokenizer = AutoTokenizer.from_pretrained("/Users/pan/Downloads/huggingface/gpt2-imdb", padding_side="left") - def __init__(self, pad_token_id: int = 0, padding_side: str = "right"): - self.pad_token_id = pad_token_id - self.padding_side = padding_side - self.model_input_names = ["input_ids", "attention_mask", "labels"] + tokenizer.pad_token_id = tokenizer.eos_token_id - def pad( - self, - encoded_inputs, - padding=True, - max_length=None, - pad_to_multiple_of=None, - return_tensors=None, - ): - assert padding in [True, "max_length"] - target_length = max_length or max(len(feature["input_ids"]) for feature in encoded_inputs) - pad_values = {"input_ids": self.pad_token_id, "attention_mask": 0, "labels": -100} - padded = {key: [] for key in encoded_inputs[0].keys()} - for feature in encoded_inputs: - for key, value in feature.items(): - value_list = list(value) - pad_value = pad_values.get(key, 0) - padded[key].append(value_list + [pad_value] * (target_length - len(value_list))) - if return_tensors == "pt": - for key in padded: - padded[key] = torch.tensor(padded[key], dtype=torch.long) - return padded - - -def test_collate_fn_to_dict_list_merges_tensor_and_python_data(): - data_list = [ - {"input_ids": torch.tensor([[1, 2]]), "meta": {"id": "a"}}, - {"input_ids": torch.tensor([[3, 4]]), "meta": {"id": "b"}}, - ] - - output = collate_fn_to_dict_list(data_list) - - assert torch.equal(output["input_ids"], torch.tensor([[1, 2], [3, 4]])) - assert isinstance(output["meta"], np.ndarray) - assert output["meta"].shape == (2,) - assert output["meta"][0]["id"] == "a" - assert output["meta"][1]["id"] == "b" - - -def test_data_collator_with_padding_for_padded_keys_handles_unpadded_fields(): - tokenizer = DummyTokenizer(pad_token_id=9) - collator = DataCollatorWithPaddingForPaddedKeys( - tokenizer=tokenizer, - padding="max_length", - max_length=6, + max_length = 32 + data_collator = DataCollatorWithPaddingForPaddedKeys( + tokenizer=tokenizer, padding="max_length", max_length=max_length ) features = [ { - "input_ids": [1, 2, 3], - "attention_mask": [1, 1, 1], - "labels": [10, 11, 12], + "input_ids": tokenizer.encode("Hello, how are you?", return_tensors="pt").squeeze(0), + "labels": torch.tensor(1), "auxiliary": {"type": 1}, }, { - "input_ids": [4, 5], - "attention_mask": [1, 1], - "labels": [13, 14], + "input_ids": tokenizer.encode("I'm fine, thank you!", return_tensors="pt").squeeze(0), + "labels": torch.tensor(0), "auxiliary": {"type": 2}, }, + { + "input_ids": tokenizer.encode("What about you?", return_tensors="pt").squeeze(0), + "labels": torch.tensor(1), + "auxiliary": {"type": 3}, + }, ] - - batch = collator(features) - - assert batch["input_ids"].shape == (2, 6) - assert torch.equal(batch["input_ids"][0, 3:], torch.tensor([9, 9, 9])) - assert torch.equal(batch["attention_mask"][1], torch.tensor([1, 1, 0, 0, 0, 0])) - assert torch.equal(batch["labels"][1], torch.tensor([13, 14, -100, -100, -100, -100])) - - expected_position_ids = torch.tensor( - [ - [0, 1, 2, 2, 2, 2], - [0, 1, 1, 1, 1, 1], - ] - ) - assert torch.equal(batch["position_ids"], expected_position_ids) - - assert isinstance(batch["auxiliary"], np.ndarray) - assert batch["auxiliary"][0]["type"] == 1 - assert batch["auxiliary"][1]["type"] == 2 + for 
feature in features: + feature["attention_mask"] = [1] * len(feature["input_ids"]) + + batch = data_collator(features) + + print("Padded input_ids:") + print(batch["input_ids"]) + print("Padded attention_mask:") + print(batch["attention_mask"]) + print("Labels:") + print(batch["labels"]) + + assert ( + batch["input_ids"].shape[1] == max_length + ), f"Expected max_length {max_length}, got {batch['input_ids'].shape[1]}" + print(f"All inputs padded to length {max_length} correctly.") diff --git a/tests/distributed/executor/test_async_cluster.py b/tests/distributed/executor/test_async_cluster.py new file mode 100644 index 000000000..92533c82f --- /dev/null +++ b/tests/distributed/executor/test_async_cluster.py @@ -0,0 +1,66 @@ +from typing import Any + +import pytest +import ray +import asyncio + +from roll.configs.worker_config import WorkerConfig +from roll.distributed.executor.cluster import Cluster +from roll.distributed.executor.worker import Worker +from roll.distributed.scheduler.decorator import register, Dispatch +from roll.distributed.scheduler.resource_manager import ResourceManager + + +@ray.remote +class TestWorker(Worker): + @register(dispatch_mode=Dispatch.ONE_TO_ALL) + async def test_one_to_all(self): + return 1 + + @register(dispatch_mode=Dispatch.ONE_TO_ALL_ONE) + async def test_one_to_all_one(self): + return 1 + + @register(dispatch_mode=Dispatch.ALL_TO_ALL) + async def test_all_to_all(self): + return 1 + + @register(dispatch_mode=Dispatch.DP_MP_COMPUTE) + async def test_dp_mp_compute(self): + return 1 + + @register(dispatch_mode=Dispatch.DP_MP_DISPATCH_FIRST) + async def test_dp_mp_dispatch_first(self): + return 1 + +def test_async_cluster(): + ray.init() + resource_manager = ResourceManager(0, 1) + worker_config = WorkerConfig(name="test_worker", world_size=2) + + cluster: Any = Cluster( + name=worker_config.name, + resource_manager=resource_manager, + worker_cls=TestWorker, + worker_config=worker_config, + ) + + loop = asyncio.get_event_loop() + + ret = loop.run_until_complete(asyncio.gather(*cluster.test_one_to_all(blocking=False))) + assert ret == [1, 1] + + ret = loop.run_until_complete(asyncio.gather(*[ref.obj_ref for ref in cluster.test_one_to_all_one(blocking=False)])) + assert ret == [1, 1] + + ret = loop.run_until_complete(asyncio.gather(*cluster.test_all_to_all(blocking=False))) + assert ret == [1, 1] + + ret = loop.run_until_complete(asyncio.gather(*[ref.obj_ref for ref in cluster.test_dp_mp_compute(blocking=False)])) + assert ret == [1, 1] + + ret = loop.run_until_complete(asyncio.gather(*[ref.obj_ref for ref in cluster.test_dp_mp_dispatch_first(blocking=False)])) + assert ret == [1, 1] + +if __name__ == "__main__": + test_async_cluster() diff --git a/tests/distributed/executor/test_ray_debugger.py b/tests/distributed/executor/test_ray_debugger.py new file mode 100644 index 000000000..e356b9bf8 --- /dev/null +++ b/tests/distributed/executor/test_ray_debugger.py @@ -0,0 +1,35 @@ +""" +debug code from: https://docs.ray.io/en/latest/ray-observability/ray-distributed-debugger.html +""" +import ray +import sys + +# Add the RAY_DEBUG_POST_MORTEM=1 environment variable +# if you want to activate post-mortem debugging +ray.init( + runtime_env={ + "env_vars": {"RAY_DEBUG": "1"}, + }, + log_to_driver=True, +) + + +@ray.remote +def my_task(x): + y = x * x + print("my_task: x = {}, y = {}".format(x, y)) + breakpoint() # Add a breakpoint in the Ray task. 
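+    # (Descriptive comment added for clarity.) With RAY_DEBUG="1" set in the
+    # runtime_env above, this breakpoint() pauses the remote task until a Ray
+    # distributed debugger session attaches (see the docs URL at the top of this
+    # file); execution then continues to the return statement below.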
+ return y + + +@ray.remote +def post_mortem(x): + x += 1 + raise Exception("An exception is raised.") + return x + + +if len(sys.argv) == 1: + ray.get(my_task.remote(10)) +else: + ray.get(post_mortem.remote(10)) \ No newline at end of file diff --git a/tests/distributed/scheduler/test_generate_scheduler.py b/tests/distributed/scheduler/test_generate_scheduler.py new file mode 100644 index 000000000..c572ee895 --- /dev/null +++ b/tests/distributed/scheduler/test_generate_scheduler.py @@ -0,0 +1,495 @@ +import asyncio +import ray +import math +import random +from typing import List, Optional +from dataclasses import dataclass +import torch +import numpy as np +import pytest + +from roll.distributed.scheduler.generate_scheduler import ( + DynamicSamplingScheduler, + RolloutContext, + LoadBalancer, + ExperienceItem, +) +import roll.distributed.scheduler.user_defined_rollout_loop as udrl +from roll.distributed.scheduler.user_defined_rollout_loop import UserDefinedRolloutLoop as UserDefinedRolloutLoopBase +from roll.distributed.scheduler.protocol import DataProto +from roll.distributed.executor.worker import RankInfo +from roll.configs import ModelArguments +from roll.configs.worker_config import WorkerConfig +from roll.pipeline.rlvr.rlvr_config import RewardConfig, RewardFilterConfig +from roll.utils.logging import get_logger + + +logger = get_logger() + + +async def test_load_balancer(): + load_balancer = LoadBalancer(mp_rank_zero={0:0, 1:0, 2:0, 3:0}, max_running_requests=2) + + leases = [] + for i in range(8): + lease = await load_balancer.acquire(1) + assert lease._dp_rank == i % 4 + leases.append(lease) + assert load_balancer.full() + for i in range(8): + leases[i].clear() + assert load_balancer.empty() + + async def process_new_prompt(): + lease = await load_balancer.acquire(2) + await asyncio.sleep(2) + for i in range(2): + assert lease.lease == 2 - i + async with lease.lock(1) as dp_rank: + assert dp_rank == lease._dp_rank + assert lease.lease == 1 - i + return lease._dp_rank + + tasks = [asyncio.create_task(process_new_prompt()) for _ in range(4)] + await asyncio.sleep(1) + assert load_balancer.full() + await asyncio.sleep(2) + assert load_balancer.empty() + await load_balancer.wait_complete() + assert load_balancer.empty() + ret = await asyncio.gather(*tasks) + assert len(ret) == 4 and sum(ret) == 6 + assert set(ret) == set([0, 1, 2, 3]) + + tasks = [asyncio.create_task(process_new_prompt()) for _ in range(8)] + await asyncio.sleep(1) + assert load_balancer.full() + await asyncio.sleep(2) + assert load_balancer.full() + await load_balancer.wait_complete() + assert load_balancer.empty() + ret = await asyncio.gather(*tasks) + assert len(ret) == 8 and sum(ret) == 12 + assert set(ret) == set([0, 1, 2, 3]) + + async def suspended(): + while load_balancer._suspend: + load_balancer.suspend_event.clear() + await load_balancer.suspend_event.wait() + + load_balancer.suspend() + tasks = [asyncio.create_task(process_new_prompt()) for _ in range(8)] + await asyncio.sleep(1) + assert load_balancer.empty() + wait_task = asyncio.create_task(suspended()) + await asyncio.sleep(1) + assert not wait_task.done() + load_balancer.resume() + await wait_task + await asyncio.sleep(1) + assert load_balancer.full() + await load_balancer.wait_complete() + assert load_balancer.empty() + ret = await asyncio.gather(*tasks) + assert len(ret) == 8 and sum(ret) == 12 + assert set(ret) == set([0, 1, 2, 3]) + + +@ray.remote +class MockWorker: + async def generate_request(self, data: DataProto): + if "turn" not in 
data.meta_info: + data.meta_info["turn"] = 1 + else: + data.meta_info["turn"] += 1 + + if data.meta_info["turn"] < 3: + data.meta_info["finihsh_reasons"] = ["abort"] + else: + data.meta_info["finihsh_reasons"] = ["stop"] + + return data + + async def compute_rewards(self, data: DataProto): + return data + + async def abort_requests(self, ids): + return + +class MockCluster: + def __init__(self, workers: List[MockWorker]): + self.workers = workers + self.worker_rank_info = [RankInfo() for _ in range(4)] + self.worker_config = WorkerConfig(model_args=ModelArguments(model_type="diffusion_module")) + + def get_rank_info(self, rank): + return self.worker_rank_info[rank] + +class MockCollectFn: + def __init__(self, tokenizer): + pass + + def __call__(self, data): + assert isinstance(data, list) + assert len(data) == 1 + assert isinstance(data[0], dict) + domain = [data[0]["domain"]] + data[0]["domain"] = np.empty(len(domain), dtype=object) + data[0]["domain"][:] = domain + return data[0] + +@dataclass +class MockPipelineConfig: + is_val: bool = False + + async_generation_ratio: float = 0 + max_running_requests: int = 128 + is_num_return_sequences_expand: bool = True # this unit test only support is_num_return_sequences_expand + is_use_additional_prompts: bool = False + max_additional_running_prompts: int = 0 + user_defined_rollout_loop_cls: str = "roll.distributed.scheduler.user_defined_rollout_loop.UserDefinedRolloutLoop" + + seed: int = 0 + sequence_length: int = 0 + val_sequence_length: int = 0 + prompt_length: int = 0 + + rewards = {"default": RewardConfig(query_filter_config=RewardFilterConfig(type="no_filter"))} + +def postprocess_paused_data(pre_data, data: DataProto, sequence_length, prompt_length) -> DataProto: + return data +udrl.postprocess_paused_data = postprocess_paused_data + +def postprocess_output_data(request, data: DataProto, sequence_length) -> DataProto: + return data +udrl.postprocess_output_data = postprocess_output_data + +class UserDefinedRolloutLoopWithFilter(UserDefinedRolloutLoopBase): + def __init__(self): + super().__init__() + self.used_prompt = 0 + + async def process_new_prompt(self, context: RolloutContext) -> Optional[DataProto|List[DataProto]]: + ret = await super().process_new_prompt(context) + self.used_prompt += 1 + if self.used_prompt < 16: + return None + else: + return ret + +class UserDefinedRolloutLoopWithDynamicSamplen(UserDefinedRolloutLoopBase): + async def process_new_prompt(self, context: RolloutContext) -> Optional[DataProto|List[DataProto]]: + ret = await super().process_new_prompt(context) + assert isinstance(ret, list) + # dynamic num_return_sequences + if random.choice([True, False]): + return ret * 2 + else: + return ret[0] + +class MockDynamicSamplingScheduler(DynamicSamplingScheduler): + def __init__(self, pipeline_config): + super().__init__(pipeline_config) + self.mock_pipeline_config = pipeline_config + + async def set_scheduler(self): + actor_cluster = MockCluster([MockWorker.remote() for _ in range(4)]) + reward_clusters = {"default": MockCluster([MockWorker.remote() for _ in range(4)])} + await super().set_scheduler( + actor_cluster, + reward_clusters, + dataset=range(0,1024), + collect_fn_cls=MockCollectFn, + collect_fn_kwargs={}, + is_val=self.mock_pipeline_config.is_val, + ) + + def get_next_dataset_item(self): + return { + "prompt": torch.ones((1, 1)), + "response_level_rewards": torch.ones((1, 1)), + "domain": "default", + } + + def collect_items_as_batch(self, finished_items: List[ExperienceItem]): + batch = 
DataProto(meta_info={ + "finished_items": finished_items, + "metrics": {}, + }) + return batch + +async def test_val(): + logger.info("TEST test_val") + async_generation_ratio = 2 + pipeline_config = MockPipelineConfig( + is_val=True, + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + assert item.prompt_id in list(range(i * 4, (i + 1) * 4)), f"{[item.prompt_id for item in ret]}" + logger.info(f"test_val step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync(): + logger.info("TEST test_sync") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + assert item.prompt_id in list(range(i * 4, (i + 1) * 4)), f"{[item.prompt_id for item in ret]}" + logger.info(f"test_sync step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync_pause(): + logger.info("TEST test_sync_pause") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + assert item.prompt_id in list(range(i * 4, (i + 1) * 4)), f"{[item.prompt_id for item in ret]}" + logger.info(f"test_sync_pause step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync_filter(): + logger.info("TEST test_sync_filter") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=True, + 
max_additional_running_prompts=2, + user_defined_rollout_loop_cls="tests.distributed.scheduler.test_generate_scheduler.UserDefinedRolloutLoopWithFilter", + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(global_step=i, batch_size=4, data=data) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + logger.info(f"test_sync_filter step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync_additional_prompts(): + logger.info("TEST test_sync_additional_prompts") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=True, + max_additional_running_prompts=2, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + logger.info(f"test_sync_additional_prompts step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync_dynamic_num_return_sequences(): + logger.info("TEST test_sync_dynamic_num_return_sequences") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=True, + max_additional_running_prompts=2, + user_defined_rollout_loop_cls="tests.distributed.scheduler.test_generate_scheduler.UserDefinedRolloutLoopWithDynamicSamplen", + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(global_step=i, batch_size=4, data=data) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + logger.info(f"test_sync_dynamic_num_return_sequences step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_sync_dynamic_num_return_sequences_exception(): + logger.info("TEST test_sync_dynamic_num_return_sequences_exception") + async_generation_ratio = 0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + user_defined_rollout_loop_cls="tests.distributed.scheduler.test_generate_scheduler.UserDefinedRolloutLoopWithDynamicSamplen", + 
) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + with pytest.raises(Exception): + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(global_step=i, batch_size=4, data=data) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step == max(0, i) + logger.info(f"test_sync_dynamic_num_return_sequences_exception step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await scheduler.shutdown() + +async def test_1_off(): + logger.info("TEST test_1_off") + async_generation_ratio = 1 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step >= max(0, i - math.ceil(async_generation_ratio)) + assert item.sampling_start_step <= i + assert item.prompt_id >= max(0, i - async_generation_ratio) * 4 + assert item.prompt_id < (i + 1 + async_generation_ratio) * 4 + logger.info(f"test_1_off step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await asyncio.sleep(2) + await scheduler.shutdown() + +async def test_3_off(): + logger.info("TEST test_3_off") + async_generation_ratio = 3.0 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step >= max(0, i - math.ceil(async_generation_ratio)) + assert item.sampling_start_step <= i + assert item.prompt_id >= max(0, i - async_generation_ratio) * 4 + assert item.prompt_id < (i + 1 + async_generation_ratio) * 4 + logger.info(f"test_3_off step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await asyncio.sleep(2) + await scheduler.shutdown() + +async def test_2_5_off(): + logger.info("TEST test_2_5_off") + async_generation_ratio = 2.5 + pipeline_config = MockPipelineConfig( + async_generation_ratio=async_generation_ratio, + max_running_requests=2, + is_use_additional_prompts=False, + max_additional_running_prompts=0, + ) + scheduler = MockDynamicSamplingScheduler(pipeline_config) + await 
scheduler.set_scheduler() + for i in range(10): + logger.info(f"pipeline step {i}") + await scheduler.pause_sampling() + data = DataProto(meta_info={"generation_config": {"num_return_sequences": 2}}) + ret = await scheduler.get_batch(data=data, global_step=i, batch_size=4) + # logger.info(f"step {i}: {ret}") + ret = ret.meta_info["finished_items"] + assert len(ret) == 8, f"{len(ret)=}" + for item in ret: + assert item.sampling_start_step >= max(0, i - math.ceil(async_generation_ratio)) + assert item.sampling_start_step <= i + logger.info(f"test_2_5_off step={i}, response step={[item.sampling_start_step for item in ret]}, prompt_id={[item.prompt_id for item in ret]}") + await asyncio.sleep(2) + await scheduler.shutdown() + +async def test_dynamic_sampling_scheduler(): + await test_val() + await test_sync() + await test_sync_pause() + await test_sync_filter() + await test_sync_additional_prompts() + await test_sync_dynamic_num_return_sequences() + await test_sync_dynamic_num_return_sequences_exception() + await test_1_off() + await test_3_off() + await test_2_5_off() + + +if __name__ == "__main__": + ray.init() + loop = asyncio.get_event_loop() + loop.run_until_complete(test_load_balancer()) + loop.run_until_complete(test_dynamic_sampling_scheduler()) diff --git a/tests/distributed/strategy/checkpoint/fsdp_config.yaml b/tests/distributed/strategy/checkpoint/fsdp_config.yaml new file mode 100644 index 000000000..9d1f9fbf6 --- /dev/null +++ b/tests/distributed/strategy/checkpoint/fsdp_config.yaml @@ -0,0 +1,55 @@ +hydra: + run: + dir: . + output_subdir: null + +exp_name: "fsdp2_train_checkpoint_debug" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +checkpoint_config: + type: file_system + output_dir: ./test/checkpoint_fsdp_saving2 + +track_with: stdout + +save_steps: 1 +rollout_batch_size: 512 +prompt_length: 128 +response_length: 512 +resume_from_checkpoint: ./test/checkpoint_fsdp/20251120-154318/checkpoint-1 +pretrain: Qwen/Qwen3-0.6B + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + training_args: + learning_rate: 5.0e-7 + weight_decay: 0 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + warmup_steps: 5 + num_train_epochs: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + max_samples: 4096 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: float32 + fsdp_size: 8 + reshard_after_forward: true + offload_policy: true + device_mapping: list(range(0,8)) + +actor_infer: + generating_args: + num_return_sequences: 1 \ No newline at end of file diff --git a/tests/distributed/strategy/checkpoint/fsdp_lora_config.yaml b/tests/distributed/strategy/checkpoint/fsdp_lora_config.yaml new file mode 100644 index 000000000..679efa953 --- /dev/null +++ b/tests/distributed/strategy/checkpoint/fsdp_lora_config.yaml @@ -0,0 +1,59 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "fsdp2_train_checkpoint_debug" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +checkpoint_config: + type: file_system + output_dir: ./test_lora/checkpoint_fsdp_saving1 + +track_with: stdout + +save_steps: 1 +rollout_batch_size: 512 +prompt_length: 128 +response_length: 512 +lora_target: o_proj,q_proj,k_proj,v_proj +pretrain: Qwen/Qwen3-0.6B + +actor_train: + model_args: + disable_gradient_checkpointing: false + dtype: bf16 + model_type: ~ + lora_target: ${lora_target} + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + training_args: + learning_rate: 5.0e-7 + weight_decay: 0 + per_device_train_batch_size: 4 + gradient_accumulation_steps: 8 + warmup_steps: 5 + num_train_epochs: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + max_samples: 4096 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: float32 + fsdp_size: 8 + reshard_after_forward: true + offload_policy: true + device_mapping: list(range(0,8)) + +actor_infer: + generating_args: + num_return_sequences: 1 \ No newline at end of file diff --git a/tests/distributed/strategy/checkpoint/megatron_config.yaml b/tests/distributed/strategy/checkpoint/megatron_config.yaml index f07304818..1938eab16 100644 --- a/tests/distributed/strategy/checkpoint/megatron_config.yaml +++ b/tests/distributed/strategy/checkpoint/megatron_config.yaml @@ -10,7 +10,8 @@ output_dir: ./output checkpoint_config: type: file_system - output_dir: /data/cpfs_0/xiongshaopan.xsp/models/${exp_name} + async_upload: false + output_dir: /data/ckpt_path/models/${exp_name} track_with: stdout @@ -18,11 +19,8 @@ save_steps: 1 rollout_batch_size: 512 prompt_length: 128 response_length: 512 -#resume_from_checkpoint: /data/cpfs_0/xiongshaopan.xsp/models/megatron_0.5B_ckpt/checkpoint-0 -#resume_from_checkpoint: /data/cpfs_0/xiongshaopan.xsp/models/megatron_0.5B_async_ckpt/checkpoint-1 -#resume_from_checkpoint: /data/cpfs_0/xiongshaopan.xsp/models/megatron_train_checkpoint_debug/20250305-172118/checkpoint-1 -resume_from_checkpoint: /data/cpfs_0/xiongshaopan.xsp/models/megatron_train_checkpoint_debug/20250305-212054/checkpoint-1 -pretrain: /data/cpfs_0/common/models/Qwen2.5-0.5B-Instruct +resume_from_checkpoint: false +pretrain: Qwen/Qwen2.5-7B-Instruct actor_train: model_args: @@ -36,6 +34,8 @@ actor_train: gradient_accumulation_steps: 8 warmup_steps: 5 num_train_epochs: 1 + save_hf_model: true + data_args: template: qwen2_5 file_name: data/comparison_gpt4_data_zh.json diff --git a/tests/distributed/strategy/checkpoint/test_fsdp_strategy.py b/tests/distributed/strategy/checkpoint/test_fsdp_strategy.py new file mode 100644 index 000000000..59b7f8802 --- /dev/null +++ b/tests/distributed/strategy/checkpoint/test_fsdp_strategy.py @@ -0,0 +1,54 @@ +from typing import Any + +import torch + +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.initialize import init +from roll.models.model_providers import default_tokenizer_provider +from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.logging import get_logger +from tests.distributed.strategy.make_baseline_config import make_baseline_config + +logger = get_logger() + + +class TestModelCheckpointPipeline(BasePipeline): + + def __init__(self, pipeline_config: RLVRConfig): + 
super().__init__(pipeline_config) + + self.tokenizer = default_tokenizer_provider( + model_args=self.pipeline_config.actor_train.model_args, + ) + max_steps = 10240 * self.pipeline_config.actor_train.training_args.num_train_epochs + self.pipeline_config.set_max_steps(max_steps=max_steps) + + self.actor_train: Any = Cluster( + name=self.pipeline_config.actor_train.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_train, + ) + self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.set_checkpoint_clusters(self.actor_train) + + @torch.no_grad() + def run(self): + # self.actor_train.strategy.save_checkpoint(self.pipeline_config.output_dir, global_step) + self.state.log_history.append({"global_step": 1}) + self.do_checkpoint(global_step=1) + self.do_checkpoint(global_step=2, is_last_step=True) + + +if __name__ == "__main__": + init() + + ppo_config = make_baseline_config(config_path="./checkpoint", config_name="fsdp_config") + # ppo_config = make_baseline_config( + # config_path="./checkpoint", config_name="fsdp_lora_config" + # ) + + pipeline = TestModelCheckpointPipeline(ppo_config) + metric_list = pipeline.run() diff --git a/tests/distributed/strategy/checkpoint/test_megatron_stategy_ckpt.py b/tests/distributed/strategy/checkpoint/test_megatron_stategy_ckpt.py index fba2d5b8d..20f675299 100644 --- a/tests/distributed/strategy/checkpoint/test_megatron_stategy_ckpt.py +++ b/tests/distributed/strategy/checkpoint/test_megatron_stategy_ckpt.py @@ -19,10 +19,6 @@ class TestModelCheckpointPipeline(BasePipeline): def __init__(self, pipeline_config: RLVRConfig): super().__init__(pipeline_config) - self.tokenizer = default_tokenizer_provider( - model_args=self.pipeline_config.actor_train.model_args, - template_name=self.pipeline_config.actor_train.data_args.template, - ) max_steps = 10240 * self.pipeline_config.actor_train.training_args.num_train_epochs self.pipeline_config.set_max_steps(max_steps=max_steps) @@ -38,7 +34,9 @@ def __init__(self, pipeline_config: RLVRConfig): @torch.no_grad() def run(self): # self.actor_train.strategy.save_checkpoint(self.pipeline_config.output_dir, global_step) + self.state.log_history.append({}) self.do_checkpoint(global_step=1) + self.state.log_history.append({}) self.do_checkpoint(global_step=2) diff --git a/tests/distributed/strategy/context_parallel/test_fsdp2_cp_grad_equivalence.py b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_grad_equivalence.py new file mode 100644 index 000000000..f5eae7711 --- /dev/null +++ b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_grad_equivalence.py @@ -0,0 +1,169 @@ +import os +import socket +import tempfile +from typing import Dict + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +from roll.utils.context_parallel import get_ulysses_group, set_upg_manager +from roll.utils.context_parallel.autograd_gather import ulysses_gather +from roll.utils.functionals import agg_loss, log_probs_from_logits + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def _broadcast_state_dict(module: torch.nn.Module, src: int = 0): + # Ensure identical initialization across ranks. 
+ for _, p in module.state_dict().items(): + if torch.is_tensor(p): + dist.broadcast(p, src=src) + + +def _ddp_average_grads(module: torch.nn.Module): + for p in module.parameters(): + if p.grad is None: + continue + dist.all_reduce(p.grad, op=dist.ReduceOp.SUM) + p.grad.div_(dist.get_world_size()) + + +def _run_and_save_grads( + rank: int, + world_size: int, + cp_size: int, + loss_agg_mode: str, + master_port: int, + out_path: str, +): + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(master_port) + + torch.cuda.set_device(rank) + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + + set_upg_manager(ulysses_size=cp_size, rank=rank, world_size=world_size) + group = get_ulysses_group() + + torch.manual_seed(1234) + torch.cuda.manual_seed_all(1234) + + vocab = 97 + hidden = 32 + model = torch.nn.Sequential( + torch.nn.Embedding(vocab, hidden), + torch.nn.Linear(hidden, vocab, bias=False), + ).cuda() + _broadcast_state_dict(model, src=0) + + bs, seqlen = 2, 8 + assert seqlen % max(cp_size, 1) == 0 + + if rank == 0: + input_ids = torch.randint(0, vocab, (bs, seqlen), device="cuda", dtype=torch.long) + attention_mask = torch.ones((bs, seqlen), device="cuda", dtype=torch.long) + else: + input_ids = torch.empty((bs, seqlen), device="cuda", dtype=torch.long) + attention_mask = torch.empty((bs, seqlen), device="cuda", dtype=torch.long) + dist.broadcast(input_ids, src=0) + dist.broadcast(attention_mask, src=0) + + if cp_size > 1: + cp_rank = rank % cp_size + shard = seqlen // cp_size + start = cp_rank * shard + end = (cp_rank + 1) * shard + + input_ids_local = input_ids[:, start:end] + logits_local = model(input_ids_local) + + labels = input_ids[:, 1:].clone() + labels[attention_mask[:, 1:] == 0] = 0 + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + labels_local = labels[:, start:end] + + log_probs_local = log_probs_from_logits(logits_local, labels_local) + log_probs = ulysses_gather( + log_probs_local, + gather_dim=1, + group=group, + grad_scaler=True, + ) + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + else: + logits = model(input_ids) + labels = input_ids[:, 1:].clone() + labels[attention_mask[:, 1:] == 0] = 0 + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + log_probs = log_probs_from_logits(logits, labels) + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + + # PPO-style uses negative log-prob as a loss term. + response_mask = attention_mask[:, 1:].long() + loss = agg_loss(loss_mat=-log_probs, loss_mask=response_mask, loss_agg_mode=loss_agg_mode) + loss.backward() + + # Simulate DP/FSDP gradient averaging across ranks. + _ddp_average_grads(model) + + if rank == 0: + grads: Dict[str, torch.Tensor] = {} + for name, p in model.named_parameters(): + grads[name] = p.grad.detach().cpu() + torch.save({"loss": float(loss.detach().cpu()), "grads": grads}, out_path) + + dist.barrier() + dist.destroy_process_group() + + +@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed is not available") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires >= 2 CUDA devices") +@pytest.mark.parametrize("loss_agg_mode", ["token-mean", "seq-mean-token-sum"]) +def test_fsdp2_cp_grad_equivalence_vs_cp1(loss_agg_mode: str): + """ + Gradient equivalence test for CP gather semantics. 
+ + We run twice on the same 2-GPU world: + - baseline: cp_size=1 + - CP: cp_size=2 + Both runs do a DDP-style gradient averaging across the 2 ranks. + + With autograd-friendly CP gather (slice-only backward + grad scaling), + the averaged gradients should match the cp_size=1 baseline. + """ + world_size = 2 + + with tempfile.TemporaryDirectory() as td: + out_cp1 = os.path.join(td, f"grads_cp1_{loss_agg_mode}.pt") + out_cp2 = os.path.join(td, f"grads_cp2_{loss_agg_mode}.pt") + + port1 = _find_free_port() + port2 = _find_free_port() + + mp.spawn( + _run_and_save_grads, + args=(world_size, 1, loss_agg_mode, port1, out_cp1), + nprocs=world_size, + join=True, + ) + mp.spawn( + _run_and_save_grads, + args=(world_size, 2, loss_agg_mode, port2, out_cp2), + nprocs=world_size, + join=True, + ) + + ref = torch.load(out_cp1, map_location="cpu") + cp = torch.load(out_cp2, map_location="cpu") + + assert abs(ref["loss"] - cp["loss"]) < 1e-6 + + for k in ref["grads"].keys(): + torch.testing.assert_close(cp["grads"][k], ref["grads"][k], rtol=0, atol=1e-6) diff --git a/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_equivalence.py b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_equivalence.py new file mode 100644 index 000000000..e8514c894 --- /dev/null +++ b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_equivalence.py @@ -0,0 +1,163 @@ +import os +import socket + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +from roll.utils.context_parallel.globals import set_upg_manager +from roll.utils.context_parallel.monkey_patch import apply_ulysses_patch, unapply_ulysses_patch + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def _pad_to(x: torch.Tensor, target: int, *, dim: int = 1) -> torch.Tensor: + if x.size(dim) >= target: + return x + pad_len = target - x.size(dim) + pad = [0, 0] * x.ndim + pad[2 * (x.ndim - 1 - dim) + 1] = pad_len + return torch.nn.functional.pad(x, pad, value=0) + + +def _gather_seq_shards(x_local: torch.Tensor, lens: list[int], group) -> torch.Tensor: + max_len = max(lens) + x_pad = _pad_to(x_local, max_len, dim=1) + gathered = [torch.empty_like(x_pad) for _ in range(len(lens))] + dist.all_gather(gathered, x_pad, group=group) + parts = [g[:, :l] for g, l in zip(gathered, lens)] + return torch.cat(parts, dim=1) + + +def _worker_qwen3_hf_equivalence(rank: int, world_size: int, port: int, model_id: str) -> None: + transformers = pytest.importorskip("transformers") + pytest.importorskip("flash_attn") + + if not torch.cuda.is_available(): + pytest.skip("Qwen3 HF + FlashAttention2 CP equivalence test requires CUDA") + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need >= {world_size} CUDA devices, got {torch.cuda.device_count()}") + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + try: + torch.cuda.set_device(rank) + device = torch.device("cuda", rank) + + from transformers import AutoModelForCausalLM, AutoTokenizer + + try: + tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True, trust_remote_code=True) + except Exception as e: + pytest.skip(f"Tokenizer for {model_id} not available locally: {e}") + + if tokenizer.pad_token is None: + tokenizer.pad_token = 
tokenizer.eos_token + + try: + model = AutoModelForCausalLM.from_pretrained( + model_id, + local_files_only=True, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + ) + except Exception as e: + pytest.skip(f"Model for {model_id} not available locally (or FA2 unsupported): {e}") + + model.to(device) + model.eval() + + patch_info = apply_ulysses_patch() + if patch_info is None or (isinstance(patch_info, dict) and not patch_info.get("patched", True)): + pytest.skip("Ulysses patch was not applied (no FlashAttention2 hook patched)") + + max_len = 64 + assert max_len % world_size == 0 + + # One long "real-ish" prompt (tokenized by the real tokenizer). + text = ( + "Explain Ulysses context parallelism in Transformers with FlashAttention2. " + "Include a short example and mention sequence sharding, all-to-all, and why it preserves global attention. " + ) + for _ in range(8): + enc = tokenizer( + text, + return_tensors="pt", + padding=False, + truncation=True, + max_length=max_len, + add_special_tokens=True, + ) + if enc["input_ids"].size(1) >= max_len: + break + text = text + " Add more technical detail about rotary embeddings and KV heads." + + input_ids = enc["input_ids"][:, :max_len].to(device) + # Important for equivalence: RoPE/position embedding is applied before the FA2 hook. + position_ids = torch.arange(max_len, device=device, dtype=torch.long).unsqueeze(0) + + with torch.no_grad(): + # Baseline: CP disabled (ulysses_size=1 means the patch is a no-op). + set_upg_manager(ulysses_size=1, rank=rank, world_size=world_size) + baseline = model( + input_ids=input_ids, + position_ids=position_ids, + use_cache=False, + ).logits + + # CP: enable Ulysses group and run on local sequence shard, then gather to full logits. 
+ set_upg_manager(ulysses_size=world_size, rank=rank, world_size=world_size) + + local_len = max_len // world_size + start = rank * local_len + end = start + local_len + + input_ids_local = input_ids[:, start:end] + position_ids_local = position_ids[:, start:end] + + logits_local = model( + input_ids=input_ids_local, + position_ids=position_ids_local, + use_cache=False, + ).logits + + group = dist.group.WORLD + lens = [local_len for _ in range(world_size)] + logits_cp_full = _gather_seq_shards(logits_local.float(), lens, group) + baseline_full = baseline.float() + + if rank == 0: + torch.testing.assert_close(logits_cp_full, baseline_full, rtol=2e-2, atol=2e-2) + finally: + try: + unapply_ulysses_patch() + except Exception: + pass + dist.destroy_process_group() + + +@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed is not available") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA + FlashAttention2") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires >= 2 CUDA devices for CP all-to-all") +def test_fsdp2_cp_qwen3_hf_logits_equivalence(): + world_size = 2 + port = _find_free_port() + model_id = os.environ.get( + "ROLL_TEST_QWEN3_MODEL_ID", "/home/dilixiati.dlxtmhte/.cache/openlm/hub/14ffd5928d24731fd670f04c645a5928" + ) + mp.spawn( + _worker_qwen3_hf_equivalence, + args=(world_size, port, model_id), + nprocs=world_size, + join=True, + ) diff --git a/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py new file mode 100644 index 000000000..7b3f294fc --- /dev/null +++ b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py @@ -0,0 +1,504 @@ +import json +import os +import socket +import time + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +from roll.utils.context_parallel.globals import get_ulysses_group, set_upg_manager +from roll.utils.context_parallel.monkey_patch import apply_ulysses_patch, unapply_ulysses_patch +from roll.utils.context_parallel.rmpad_ulysses import gather_outputs_and_unpad, ulysses_pad_and_slice_inputs + +_DEBUG_LOG_PATH = os.environ.get("ROLL_DEBUG_LOG_PATH", "output/debug.log") + + +def _dbg(hypothesis_id: str, location: str, message: str, data: dict) -> None: + try: + with open(_DEBUG_LOG_PATH, "a", encoding="utf-8") as f: + f.write( + json.dumps( + { + "sessionId": "debug-session", + "runId": "pre-fix", + "hypothesisId": hypothesis_id, + "location": location, + "message": message, + "data": data, + "timestamp": int(time.time() * 1000), + }, + ensure_ascii=False, + ) + + "\n" + ) + except Exception: + pass + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def _worker_qwen3_hf_rmpad_equivalence(rank: int, world_size: int, port: int, model_id: str) -> None: + pytest.importorskip("transformers") + pytest.importorskip("flash_attn") + + if not torch.cuda.is_available(): + pytest.skip("Qwen3 HF + FlashAttention2 CP rmpad equivalence test requires CUDA") + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need >= {world_size} CUDA devices, got {torch.cuda.device_count()}") + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + dist.init_process_group(backend="nccl", rank=rank, 
world_size=world_size) + try: + torch.cuda.set_device(rank) + device = torch.device("cuda", rank) + + from transformers import AutoModelForCausalLM, AutoTokenizer + from transformers import __version__ as transformers_version + + from flash_attn import __version__ as flash_attn_version + + try: + tokenizer = AutoTokenizer.from_pretrained(model_id, local_files_only=True, trust_remote_code=True) + except Exception as e: + pytest.skip(f"Tokenizer for {model_id} not available locally: {e}") + + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + try: + model = AutoModelForCausalLM.from_pretrained( + model_id, + local_files_only=True, + trust_remote_code=True, + torch_dtype=torch.bfloat16, + attn_implementation="flash_attention_2", + ) + except Exception as e: + pytest.skip(f"Model for {model_id} not available locally (or FA2 unsupported): {e}") + + model.to(device) + model.eval() + + patch_info = apply_ulysses_patch() + if patch_info is None or (isinstance(patch_info, dict) and not patch_info.get("patched", True)): + pytest.skip("Ulysses patch was not applied (no FlashAttention2 hook patched)") + + max_len = 64 + assert max_len % world_size == 0 + + tokenizer.padding_side = "right" + texts = [ + "Explain FlashAttention2 remove-padding (varlen) and how it interacts with rotary embeddings and position ids.", + "Relate remove-padding to Ulysses context parallelism and all-to-all. Give a small example.", + ] + enc = tokenizer( + texts, + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_len, + ) + input_ids = enc["input_ids"].to(device) + attention_mask = enc["attention_mask"].to(device) + position_ids = (attention_mask.long().cumsum(dim=1) - 1).clamp_min(0) + + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input + + if rank == 0 and hasattr(model, "_update_causal_mask"): + original_update_mask = model._update_causal_mask + + def _instrumented_update_mask(attention_mask, input_tensor, cache_position, **kwargs): + result = original_update_mask(attention_mask, input_tensor, cache_position, **kwargs) + prepare_log.append( + { + "input_attn_mask_shape": ( + tuple(attention_mask.shape) if torch.is_tensor(attention_mask) else None + ), + "input_attn_mask_is_none": attention_mask is None, + "input_tensor_shape": tuple(input_tensor.shape) if torch.is_tensor(input_tensor) else None, + "output_mask_shape": tuple(result.shape) if torch.is_tensor(result) else None, + } + ) + return result + + model._update_causal_mask = _instrumented_update_mask + if rank == 0: + enc1 = tokenizer( + [texts[0]], + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_len, + ) + input_ids1 = enc1["input_ids"].to(device) + attention_mask1 = enc1["attention_mask"].to(device) + position_ids1 = (attention_mask1.long().cumsum(dim=1) - 1).clamp_min(0) + + with torch.no_grad(): + out_padded_1 = model( + input_ids=input_ids1, + attention_mask=attention_mask1, + position_ids=position_ids1, + use_cache=False, + ).logits.float() + input_ids_rmpad_1, indices_1, *_ = unpad_input(input_ids1.unsqueeze(-1), attention_mask1) + input_ids_rmpad_1 = input_ids_rmpad_1.transpose(0, 1) + position_ids_rmpad_1 = index_first_axis( + rearrange(position_ids1.unsqueeze(-1), "b s ... 
-> (b s) ..."), + indices_1, + ).transpose(0, 1) + out_rmpad_1 = model( + input_ids=input_ids_rmpad_1, + attention_mask=None, + position_ids=position_ids_rmpad_1, + use_cache=False, + ).logits.float() + out_rmpad_1 = pad_input( + hidden_states=out_rmpad_1.squeeze(0).unsqueeze(-1), + indices=indices_1, + batch=1, + seqlen=input_ids1.size(1), + ).squeeze(-1) + m1 = attention_mask1.to(torch.bool) + max_abs_1 = float((out_padded_1 - out_rmpad_1).abs()[m1].max().item()) if m1.any() else 0.0 + _dbg( + "H9", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:bs1_probe", + "padded_vs_rmpad_bs1", + {"masked_max_abs_padded_vs_rmpad_bs1": max_abs_1, "mask_sum": int(attention_mask1.sum().item())}, + ) + if rank == 0: + _dbg( + "H1", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:_worker", + "env_and_batch", + { + "model_id": str(model_id), + "transformers_version": str(transformers_version), + "flash_attn_version": str(flash_attn_version), + "world_size": int(world_size), + "max_len": int(max_len), + "mask_sum_per_sample": [int(x) for x in attention_mask.sum(dim=1).tolist()], + "pos0_first8": position_ids[0, :8].tolist(), + "pos1_first8": position_ids[1, :8].tolist(), + }, + ) + original_fa2_forward = None + call_log = {"padded": [], "rmpad": []} + original_layer_forward = None + first_layer = None + layer_call_info = {"padded": None, "rmpad": None} + prepare_log = [] + original_update_mask = None + + if rank == 0: + + def _instrumented_fa2(*args, **kwargs): + import inspect + + sig = inspect.signature(original_fa2_forward) + bound = sig.bind(*args, **kwargs) + bound.apply_defaults() + params = bound.arguments + call_log["current_mode"].append( + { + "attention_mask_is_none": params.get("attention_mask") is None, + "has_cu_seqlens_q": "cu_seqlens_q" in params, + "has_cu_seqlens_k": "cu_seqlens_k" in params, + "query_length": int(params.get("query_length", -1)) if params.get("query_length") else -1, + "query_shape": ( + tuple(params["query_states"].shape) + if torch.is_tensor(params.get("query_states")) + else None + ), + } + ) + return original_fa2_forward(*args, **kwargs) + + try: + from transformers.integrations import flash_attention as fa_module + + original_fa2_forward = fa_module._flash_attention_forward + fa_module._flash_attention_forward = _instrumented_fa2 + except Exception: + try: + import transformers.modeling_flash_attention_utils as mfu + + original_fa2_forward = mfu._flash_attention_forward + mfu._flash_attention_forward = _instrumented_fa2 + except Exception: + original_fa2_forward = None + + with torch.no_grad(): + set_upg_manager(ulysses_size=1, rank=rank, world_size=world_size) + + if rank == 0 and original_fa2_forward is not None: + call_log["current_mode"] = call_log["padded"] + + baseline_padded_out = model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + output_hidden_states=True, + ) + baseline_padded = baseline_padded_out.logits + + if rank == 0: + first_layer = model.model.layers[0].self_attn + original_layer_forward = first_layer.forward + + def _instrumented_layer_forward(hidden_states, attention_mask=None, position_ids=None, **kwargs): + layer_call_info["current"]["attn_mask_shape"] = ( + tuple(attention_mask.shape) if torch.is_tensor(attention_mask) else None + ) + layer_call_info["current"]["attn_mask_is_none"] = attention_mask is None + layer_call_info["current"]["pos_ids_shape"] = ( + tuple(position_ids.shape) if torch.is_tensor(position_ids) else None + ) + return original_layer_forward( + 
hidden_states, attention_mask=attention_mask, position_ids=position_ids, **kwargs + ) + + first_layer.forward = _instrumented_layer_forward + + layer_call_info["current"] = layer_call_info["padded"] = {} + with torch.no_grad(): + _ = model( + input_ids=input_ids[:1, :8], # small probe + attention_mask=attention_mask[:1, :8], + position_ids=position_ids[:1, :8], + use_cache=False, + ) + if rank == 0 and original_fa2_forward is not None: + call_log["current_mode"] = call_log["rmpad"] + + input_ids_rmpad_base, indices_base, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask) + input_ids_rmpad_base = input_ids_rmpad_base.transpose(0, 1) # (1, total_nnz) + position_ids_rmpad_base = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), + indices_base, + ).transpose( + 0, 1 + ) # (1, total_nnz) + + if rank == 0: + layer_call_info["current"] = layer_call_info["rmpad"] = {} + with torch.no_grad(): + # Create a small packed input to probe + probe_ids = torch.cat([input_ids[0, :4], input_ids[1, :4]], dim=0).unsqueeze(0) # (1, 8) + probe_pos = torch.tensor([[0, 1, 2, 3, 0, 1, 2, 3]], device=device) # position resets + _ = model(input_ids=probe_ids, attention_mask=None, position_ids=probe_pos, use_cache=False) + + # Restore original forward + first_layer.forward = original_layer_forward + + _dbg( + "H12", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:layer_inputs", + "layer_attn_mask_comparison", + { + "padded_mask_shape": layer_call_info["padded"].get("attn_mask_shape"), + "padded_mask_is_none": layer_call_info["padded"].get("attn_mask_is_none"), + "rmpad_mask_shape": layer_call_info["rmpad"].get("attn_mask_shape"), + "rmpad_mask_is_none": layer_call_info["rmpad"].get("attn_mask_is_none"), + "padded_pos_shape": layer_call_info["padded"].get("pos_ids_shape"), + "rmpad_pos_shape": layer_call_info["rmpad"].get("pos_ids_shape"), + }, + ) + # endregion agent log + + baseline_rmpad_out = model( + input_ids=input_ids_rmpad_base, + attention_mask=None, + position_ids=position_ids_rmpad_base, + use_cache=False, + output_hidden_states=True, + ) + logits_rmpad_base = baseline_rmpad_out.logits # (1, total_nnz, vocab) + + baseline_rmpad = pad_input( + hidden_states=logits_rmpad_base.squeeze(0).unsqueeze(-1), + indices=indices_base, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + + # H10: locate the earliest hidden-state mismatch (after first decoder block). + if ( + rank == 0 + and getattr(baseline_padded_out, "hidden_states", None) is not None + and getattr(baseline_rmpad_out, "hidden_states", None) is not None + ): + hs_padded = baseline_padded_out.hidden_states + hs_rmpad = baseline_rmpad_out.hidden_states + # hidden_states[0] is embedding output; [1] is after first layer (for most HF decoder models). 
+ if len(hs_padded) > 1 and len(hs_rmpad) > 1: + hs1_padded = hs_padded[1].float() + hs1_rmpad = hs_rmpad[1].float() # (1, total_nnz, hidden) + hs1_rmpad_padded = pad_input( + hidden_states=hs1_rmpad.squeeze(0).unsqueeze(-1), + indices=indices_base, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + m = attention_mask.to(torch.bool) + max_abs_hs1 = float((hs1_padded - hs1_rmpad_padded).abs()[m].max().item()) if m.any() else 0.0 + tok0_abs_hs1 = float((hs1_padded[0, 0] - hs1_rmpad_padded[0, 0]).abs().max().item()) + _dbg( + "H10", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:hidden_states", + "hs_layer1_diff", + {"masked_max_abs_hs1": max_abs_hs1, "sample0_tok0_max_abs_hs1": tok0_abs_hs1}, + ) + + set_upg_manager(ulysses_size=world_size, rank=rank, world_size=world_size) + + input_ids_rmpad, indices, cu_seqlens, max_seqlen, _ = unpad_input(input_ids.unsqueeze(-1), attention_mask) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), + indices, + ).transpose( + 0, 1 + ) # (1, total_nnz) + + if rank == 0: + _dbg( + "H2", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:unpad_input", + "rmpad_metadata", + { + "total_nnz": int(input_ids_rmpad.size(1)), + "cu_seqlens_shape": tuple(cu_seqlens.shape) if torch.is_tensor(cu_seqlens) else None, + "cu_seqlens_head": ( + cu_seqlens[: min(6, cu_seqlens.numel())].tolist() if torch.is_tensor(cu_seqlens) else None + ), + "max_seqlen": int(max_seqlen) if max_seqlen is not None else None, + "position_ids_rmpad_head": position_ids_rmpad[ + 0, : min(10, position_ids_rmpad.size(1)) + ].tolist(), + }, + ) + # endregion agent log + + input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, + position_ids_rmpad, + cp_size=world_size, + cp_rank=rank, + ) + + logits_rmpad_local = model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + use_cache=False, + ).logits # (1, local_tokens, vocab) + + logits_rmpad = gather_outputs_and_unpad( + logits_rmpad_local, + gather_dim=1, + unpad_dim=1, + padding_size=pad_size, + group=get_ulysses_group(), + ) + + logits = pad_input( + hidden_states=logits_rmpad.squeeze(0).unsqueeze(-1), + indices=indices, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + + baseline_padded_full = baseline_padded.float() + baseline_rmpad_full = baseline_rmpad.float() + + if rank == 0: + mask = attention_mask.to(torch.bool) + diff_cp = (logits.float() - baseline_padded_full).abs() + diff_rmpad = (baseline_rmpad_full - baseline_padded_full).abs() + max_abs_cp = float(diff_cp[mask].max().item()) if mask.any() else 0.0 + max_abs_rmpad = float(diff_rmpad[mask].max().item()) if mask.any() else 0.0 + tok0_abs_cp = float(diff_cp[0, 0].max().item()) + tok0_abs_rmpad = float(diff_rmpad[0, 0].max().item()) + _dbg( + "H4", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:compare", + "masked_diff_stats", + { + "masked_max_abs_cp_vs_padded": max_abs_cp, + "masked_max_abs_rmpad_vs_padded": max_abs_rmpad, + "sample0_tok0_max_abs_cp_vs_padded": tok0_abs_cp, + "sample0_tok0_max_abs_rmpad_vs_padded": tok0_abs_rmpad, + }, + ) + if original_fa2_forward is not None: + try: + from transformers.integrations import flash_attention as fa_module + + fa_module._flash_attention_forward = original_fa2_forward + except Exception: + try: + import transformers.modeling_flash_attention_utils as mfu + + 
mfu._flash_attention_forward = original_fa2_forward + except Exception: + pass + _dbg( + "H11", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:fa2_calls", + "fa2_call_comparison", + { + "padded_calls": call_log["padded"][:3], + "rmpad_calls": call_log["rmpad"][:3], + "padded_count": len(call_log["padded"]), + "rmpad_count": len(call_log["rmpad"]), + }, + ) + + if original_update_mask is not None: + model._update_causal_mask = original_update_mask + _dbg( + "H13", + "tests/.../test_fsdp2_cp_qwen3_hf_rmpad_equivalence.py:prepare_calls", + "mask_generation_calls", + {"prepare_log": prepare_log[:6], "total_calls": len(prepare_log)}, + ) + + torch.testing.assert_close(logits.float()[mask], baseline_padded_full[mask], rtol=2e-2, atol=2e-2) + finally: + try: + unapply_ulysses_patch() + except Exception: + pass + dist.destroy_process_group() + + +@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed is not available") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA + FlashAttention2") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires >= 2 CUDA devices for CP all-to-all") +def test_fsdp2_cp_qwen3_hf_rmpad_logits_equivalence(): + world_size = 2 + port = _find_free_port() + model_id = os.environ.get( + "ROLL_TEST_QWEN3_MODEL_ID", + "/home/dilixiati.dlxtmhte/.cache/openlm/hub/14ffd5928d24731fd670f04c645a5928", + ) + mp.spawn( + _worker_qwen3_hf_rmpad_equivalence, + args=(world_size, port, model_id), + nprocs=world_size, + join=True, + ) diff --git a/tests/distributed/strategy/context_parallel/test_fsdp2_cp_ulysses_equivalence.py b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_ulysses_equivalence.py new file mode 100644 index 000000000..6a0679212 --- /dev/null +++ b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_ulysses_equivalence.py @@ -0,0 +1,94 @@ +import os + +import torch +import torch.distributed as dist + +from roll.utils.context_parallel.globals import get_ulysses_group, get_ulysses_size, set_upg_manager +from roll.utils.context_parallel.hf_flash_attention_patch import make_ulysses_flash_attention_forward + + +def _pad_to(x: torch.Tensor, target: int) -> torch.Tensor: + if x.size(1) >= target: + return x + pad_len = target - x.size(1) + pad = [0, 0] * x.ndim + pad[2 * (x.ndim - 2) + 1] = pad_len + return torch.nn.functional.pad(x, pad, value=0) + + +def _gather_seq_shards(x_local: torch.Tensor, lens: list[int], group) -> torch.Tensor: + max_len = max(lens) + x_pad = _pad_to(x_local, max_len) + gathered = [torch.empty_like(x_pad) for _ in range(len(lens))] + dist.all_gather(gathered, x_pad, group=group) + parts = [g[:, :l] for g, l in zip(gathered, lens)] + return torch.cat(parts, dim=1) + + +def original_forward(query_states, key_states, value_states, attention_mask, query_length, *args, **kwargs): + # A head-wise function that depends on the full sequence length, so CP needs correct all-to-all. + # Shape in/out: (bs, seqlen, heads, dim) + assert query_states.size(1) == query_length + global_mix = query_states.mean(dim=1, keepdim=True) # (bs, 1, heads, dim) + return query_states + global_mix + + +def main(): + backend = "gloo" + if not dist.is_initialized(): + dist.init_process_group(backend=backend) + + rank = dist.get_rank() + world = dist.get_world_size() + assert world == 2, "This smoke test expects torchrun --nproc_per_node=2" + + # Use the full world as the CP group for simplicity. 
+ set_upg_manager(ulysses_size=world, rank=rank, world_size=world) + group = get_ulysses_group() + assert group is not None and get_ulysses_size() == world + + # Variable local lengths to simulate remove-padding imbalance. + local_len = 2 + rank # rank0=2, rank1=3 => total=5 + lens_t = torch.tensor([local_len], dtype=torch.int64) + lens_list = [torch.zeros_like(lens_t) for _ in range(world)] + dist.all_gather(lens_list, lens_t, group=group) + lens = [int(x.item()) for x in lens_list] + total_len = sum(lens) + + # Shapes + bs, heads, dim = 1, 4, 2 # heads divisible by world + + torch.manual_seed(1234) + q_local = torch.randn(bs, local_len, heads, dim) + k_local = torch.randn(bs, local_len, heads, dim) + v_local = torch.randn(bs, local_len, heads, dim) + attn_mask_local = torch.ones(bs, local_len, dtype=torch.long) + + # Wrapped call (simulates patched HF hook) + wrapped = make_ulysses_flash_attention_forward(original_forward) + out_local = wrapped(q_local, k_local, v_local, attn_mask_local, local_len) + + # Baseline: run original on the *global* sequence (cp_size=1 semantics), then slice back to local. + q_global = _gather_seq_shards(q_local, lens, group) + k_global = _gather_seq_shards(k_local, lens, group) + v_global = _gather_seq_shards(v_local, lens, group) + attn_mask_global = _gather_seq_shards(attn_mask_local.unsqueeze(-1).to(q_local.dtype), lens, group).squeeze(-1) + + baseline_global = original_forward(q_global, k_global, v_global, attn_mask_global, total_len) + + start = sum(lens[:rank]) + end = start + local_len + baseline_local = baseline_global[:, start:end] + + torch.testing.assert_close(out_local, baseline_local, rtol=0, atol=1e-6) + + if rank == 0: + print("Ulysses wrapper equivalence smoke test passed.") + + dist.barrier() + dist.destroy_process_group() + + +if __name__ == "__main__": + os.environ.setdefault("TORCH_DISTRIBUTED_DEBUG", "DETAIL") + main() diff --git a/tests/distributed/strategy/context_parallel/test_fsdp2_cp_vlm_rmpad_equivalence.py b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_vlm_rmpad_equivalence.py new file mode 100644 index 000000000..faa3439ea --- /dev/null +++ b/tests/distributed/strategy/context_parallel/test_fsdp2_cp_vlm_rmpad_equivalence.py @@ -0,0 +1,484 @@ +import inspect +import os +import socket +from pathlib import Path + +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp + +from roll.models.model_providers import get_extra_data_provider, load_model +from roll.utils.context_parallel.globals import get_ulysses_group, set_upg_manager +from roll.utils.context_parallel.monkey_patch import apply_ulysses_patch, unapply_ulysses_patch +from roll.utils.context_parallel.rmpad_ulysses import gather_outputs_and_unpad, ulysses_pad_inputs + +try: + # Optional debugging capture utilities used elsewhere in tests. + from tests.distributed.strategy.log_probs.layer_states_capture import is_enabled as _capture_is_enabled + from tests.distributed.strategy.log_probs.layer_states_capture import save_tensor as _capture_save_tensor +except Exception: # pragma: no cover + + def _capture_is_enabled() -> bool: + return False + + def _capture_save_tensor(*_args, **_kwargs): + return None + + +def _maybe_save_cp_gathered_tensors( + *, + rank: int, + base_logits: torch.Tensor | None, + cp_gathered_logits: torch.Tensor | None, + attention_mask: torch.Tensor | None = None, +): + """ + Opt-in persistence of gathered CP outputs to debug divergence. 
+ + Enable either: + - CP_GATHER_SAVE_DIR=/path (saves via torch.save to that directory), OR + - LAYER_STATES_SAVE_DIR=... (uses layer_states_capture.save_tensor), plus CP_SAVE_GATHERED=1 + (handy when you already have layer-state capture configured). + + Notes: + - We save only on rank0 by default to avoid duplicate files. + - We also save a small per-token error map to quickly localize divergence. + """ + if os.getenv("CP_SAVE_GATHERED", "0") != "1": + return + if rank != 0: + return + if base_logits is None or cp_gathered_logits is None: + return + + with torch.no_grad(): + # (bs, seq, vocab) -> (bs, seq) + err_absmax = (cp_gathered_logits.float() - base_logits.float()).abs().amax(dim=-1) + if attention_mask is not None: + err_absmax = err_absmax * attention_mask.to(err_absmax.dtype) + + save_dir = os.getenv("CP_GATHER_SAVE_DIR", "").strip() + prefix = os.getenv("CP_GATHER_PREFIX", "cp_gather").strip() or "cp_gather" + step = os.getenv("LAYER_STATES_STEP", "0") + batch = os.getenv("LAYER_STATES_BATCH", "0") + + if save_dir: + out_dir = Path(save_dir) + out_dir.mkdir(parents=True, exist_ok=True) + torch.save(base_logits.detach().cpu(), out_dir / f"{prefix}_step{step}_batch{batch}_base_logits.pt") + torch.save( + cp_gathered_logits.detach().cpu(), out_dir / f"{prefix}_step{step}_batch{batch}_cp_gathered_logits.pt" + ) + torch.save(err_absmax.detach().cpu(), out_dir / f"{prefix}_step{step}_batch{batch}_cp_vs_base_err_absmax.pt") + return + + if _capture_is_enabled(): + _capture_save_tensor(base_logits, "base_logits", subdir="cp_gather") + _capture_save_tensor(cp_gathered_logits, "cp_gathered_logits", subdir="cp_gather") + _capture_save_tensor(err_absmax, "cp_vs_base_err_absmax", subdir="cp_gather") + + +def _find_free_port() -> int: + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def _make_dummy_pil_image(): + PIL = pytest.importorskip("PIL") + from PIL import Image + + # Deterministic small RGB image. + w, h = 32, 32 + arr = torch.arange(w * h * 3, dtype=torch.uint8).reshape(h, w, 3).numpy() + return Image.fromarray(arr, mode="RGB") + + +def _build_mm_batch(model_path: str, device: torch.device, max_len: int = 64): + transformers = pytest.importorskip("transformers") + from transformers import AutoProcessor, AutoTokenizer + + processor = AutoProcessor.from_pretrained(model_path, local_files_only=True, trust_remote_code=True) + tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True, trust_remote_code=True) + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + + img = _make_dummy_pil_image() + # Qwen-VL style models require explicit vision placeholder tokens in the text stream + # so that image/video features can be scattered into matching token positions. + text = "<|vision_start|><|image_pad|><|vision_end|> Describe the image briefly." + # Many VLM processors accept `text` + `images`; keep it simple and deterministic. + # + # Qwen3-VL is strict about multimodal token counts: if truncation clips placeholder tokens, + # it raises an error. So we disable truncation and retry with a larger max_length if needed. 
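+    # Try progressively larger max_length values until the processor accepts all multimodal
+    # placeholder tokens; if every attempt fails, re-raise the last processor error.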
+ last_err = None + for trial_max_len in (max_len, 128, 256, 512): + try: + model_inputs = processor( + text=[text], + images=[img], + return_tensors="pt", + padding="max_length", + truncation=False, + max_length=trial_max_len, + ) + max_len = trial_max_len + break + except ValueError as e: + last_err = e + continue + else: + raise last_err # type: ignore[misc] + model_inputs = {k: v.to(device) if torch.is_tensor(v) else v for k, v in dict(model_inputs).items()} + + input_ids = model_inputs["input_ids"] + attention_mask = model_inputs["attention_mask"] + + # Position ids: use existing ROLL provider (qwen2-vl) or default (others, incl qwen3-vl). + extra_provider = get_extra_data_provider(model_path, processor=processor) + extra_kwargs = { + "input_ids": input_ids, + "attention_mask": attention_mask, + "image_grid_thw": model_inputs.get("image_grid_thw"), + "video_grid_thw": model_inputs.get("video_grid_thw"), + } + # `get_extra_data_provider()` returns providers with different signatures: + # - Qwen2-VL-style provider expects image/video grid args + # - default provider only accepts (input_ids, attention_mask) + try: + sig = inspect.signature(extra_provider) + accepted = set(sig.parameters.keys()) + filtered_kwargs = {k: v for k, v in extra_kwargs.items() if k in accepted} + extra = extra_provider(**filtered_kwargs) + except Exception: + # Best-effort fallback (handles unexpected kwargs TypeError). + extra = extra_provider(input_ids=input_ids, attention_mask=attention_mask) + position_ids = extra["position_ids"].to(device) + # Match strategy behavior: (bs, C, seqlen) -> (C, bs, seqlen) + if position_ids.dim() == 3: + position_ids = position_ids.transpose(0, 1).contiguous() + + # Keep only tensors relevant for forward. + mm_args = {} + for k in ("pixel_values", "pixel_values_videos", "image_grid_thw", "video_grid_thw"): + if k in model_inputs and torch.is_tensor(model_inputs[k]): + mm_args[k] = model_inputs[k] + # Some VLMs have conditional vision tower paths; keep consistent with pipelines. + mm_args["force_vit_image"] = True + + return input_ids, attention_mask, position_ids, mm_args + + +def _to_rmpad(input_ids: torch.Tensor, attention_mask: torch.Tensor, position_ids: torch.Tensor): + pytest.importorskip("flash_attn") + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input + + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + if position_ids.dim() == 3: + position_ids_rmpad = ( + index_first_axis( + rearrange(position_ids, "c b s ... -> (b s) c ..."), + indices, + ) + .transpose(0, 1) + .unsqueeze(1) + ) # (C, 1, total_nnz) + else: + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), + indices, + ).transpose( + 0, 1 + ) # (1, total_nnz) + + def pad_back(x_rmpad: torch.Tensor) -> torch.Tensor: + # x_rmpad: (1, total_nnz, ...) 
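+        # flash_attn's pad_input scatters the flattened (total_nnz) tokens back into a dense
+        # (bs, seqlen) layout at the positions recorded by unpad_input above.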
+ dense = pad_input( + hidden_states=x_rmpad.squeeze(0).unsqueeze(-1), + indices=indices, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + return dense + + return input_ids_rmpad, position_ids_rmpad, pad_back + + +def _worker_vlm_cp_equivalence(rank: int, world_size: int, port: int, model_path: str): + pytest.importorskip("transformers") + pytest.importorskip("flash_attn") + + if not torch.cuda.is_available(): + pytest.skip("VLM CP equivalence test requires CUDA") + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need >= {world_size} CUDA devices, got {torch.cuda.device_count()}") + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + try: + torch.cuda.set_device(rank) + device = torch.device("cuda", rank) + + # Patch HF attention hooks for Ulysses. + patch_info = apply_ulysses_patch() + if patch_info is None or (isinstance(patch_info, dict) and not patch_info.get("patched", True)): + pytest.skip("Ulysses patch was not applied (no FlashAttention2 hook patched)") + + # Load model via ROLL provider so our VLM CP decoder patch is exercised. + from roll.configs.model_args import ModelArguments + + model_args = ModelArguments( + model_name_or_path=model_path, + attn_implementation="fa2", + dtype="bf16", + ulysses_size=world_size, # install decoder slice patch; runtime CP size controlled by set_upg_manager + ) + # Force each rank to keep weights on its own GPU. + model_args.device_map = {"": rank} + + model = load_model(model_args=model_args, is_trainable=False) + model.eval() + + input_ids, attention_mask, position_ids, mm_args = _build_mm_batch(model_path, device=device, max_len=256) + input_ids_rmpad, position_ids_rmpad, pad_back = _to_rmpad(input_ids, attention_mask, position_ids) + + # Baseline: CP disabled (ulysses_size=1 semantics) on the same world_size job. + set_upg_manager(ulysses_size=1, rank=rank, world_size=world_size) + with torch.no_grad(): + out_base = model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + use_cache=False, + **mm_args, + ).logits # (1, total_nnz, vocab) + dense_base = pad_back(out_base) + + # CP: use slice-after-embedding (pad-only here, slice in decoder patch). + set_upg_manager(ulysses_size=world_size, rank=rank, world_size=world_size) + input_ids_pad, pos_pad, pad_size = ulysses_pad_inputs( + input_ids_rmpad, + position_ids_rmpad, + cp_size=world_size, + ) + with torch.no_grad(): + out_local = model( + input_ids=input_ids_pad, + attention_mask=None, + position_ids=pos_pad, + use_cache=False, + **mm_args, + ).logits # (1, local_tokens, vocab) + + out_full = gather_outputs_and_unpad( + out_local, + gather_dim=1, + unpad_dim=1, + padding_size=pad_size, + group=get_ulysses_group(), + ) + dense_cp = pad_back(out_full) + + _maybe_save_cp_gathered_tensors( + rank=rank, + base_logits=dense_base, + cp_gathered_logits=dense_cp, + attention_mask=attention_mask, + ) + + if rank == 0: + mask = attention_mask.to(torch.bool) + # Compare a small vocabulary slice to reduce memory pressure. 
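+            # Only compare logits at valid (non-padded) token positions selected by the attention mask.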
+ dense_base_s = dense_base[..., :64].float() + dense_cp_s = dense_cp[..., :64].float() + torch.testing.assert_close(dense_cp_s[mask], dense_base_s[mask], rtol=3e-2, atol=3e-2) + finally: + try: + unapply_ulysses_patch() + except Exception: + pass + dist.destroy_process_group() + + +def _worker_vlm_cp_equivalence_nonrmpad(rank: int, world_size: int, port: int, model_path: str): + pytest.importorskip("transformers") + pytest.importorskip("flash_attn") + + if not torch.cuda.is_available(): + pytest.skip("VLM CP equivalence test requires CUDA") + if torch.cuda.device_count() < world_size: + pytest.skip(f"Need >= {world_size} CUDA devices, got {torch.cuda.device_count()}") + + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + + dist.init_process_group(backend="nccl", rank=rank, world_size=world_size) + try: + torch.cuda.set_device(rank) + device = torch.device("cuda", rank) + + patch_info = apply_ulysses_patch() + if patch_info is None or (isinstance(patch_info, dict) and not patch_info.get("patched", True)): + pytest.skip("Ulysses patch was not applied (no FlashAttention2 hook patched)") + + from roll.configs.model_args import ModelArguments + + model_args = ModelArguments( + model_name_or_path=model_path, + attn_implementation="fa2", + dtype="bf16", + ulysses_size=world_size, + ) + model_args.device_map = {"": rank} + model = load_model(model_args=model_args, is_trainable=False) + from tests.distributed.strategy.log_probs.apply_model_patch import apply_qwen3vl_patches + + if apply_qwen3vl_patches(): + print("Applied Qwen3VL layer states capture patches") + model.eval() + + # Use a length divisible by world_size to match CP shard requirements. + input_ids, attention_mask, position_ids, mm_args = _build_mm_batch(model_path, device=device, max_len=256) + assert input_ids.size(1) % world_size == 0 + + # Baseline (CP disabled) -> full logits. + os.environ["LAYER_STATES_SAVE_DIR"] = "./cp_layerwise_out/base" + os.environ["LAYER_STATES_PREFIX"] = "base" + os.environ["LAYER_STATES_STEP"] = "0" + os.environ["LAYER_STATES_BATCH"] = "0" + if rank == 0: # attach only one process to avoid chaos + import debugpy + + debugpy.listen(("0.0.0.0", 5679)) + print("Waiting for debugger attach on 5678...") + debugpy.wait_for_client() + debugpy.breakpoint() # or use breakpoint() after attach + set_upg_manager(ulysses_size=1, rank=rank, world_size=world_size) + with torch.no_grad(): + base_output = model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + output_hidden_states=True, + **mm_args, + ) + + base_states = base_output.hidden_states + base_layer_states = base_output.layer_states + base = base_output.logits # (bs, seq, vocab) + + # CP enabled -> decoder outputs local shard; gather to full for comparison. + os.environ["LAYER_STATES_SAVE_DIR"] = "./cp_layerwise_out/cp" + os.environ["LAYER_STATES_PREFIX"] = "cp" + set_upg_manager(ulysses_size=world_size, rank=rank, world_size=world_size) + with torch.no_grad(): + local_output = model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + use_cache=False, + output_hidden_states=True, + **mm_args, + ) + + local_states = local_output.hidden_states + local_layer_states = local_output.layer_states + local = local_output.logits # (bs, local_seq, vocab) + + # Sanity: ensure CP actually shards the sequence. 
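+        # The decoder slice-after-embedding patch makes each rank emit logits for its local
+        # sequence shard only; verify the CP group and shard size before gathering.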
+ assert dist.get_world_size(get_ulysses_group()) == world_size + assert local.size(1) * world_size == input_ids.size(1), ( + f"Expected local_seq={input_ids.size(1)//world_size}, got local_seq={local.size(1)}. " + "This usually means the VLM decoder slice-after-embedding patch did not take effect." + ) + + full = gather_outputs_and_unpad( + local, + gather_dim=1, + unpad_dim=None, + padding_size=0, + group=get_ulysses_group(), + ) + + _maybe_save_cp_gathered_tensors( + rank=rank, + base_logits=base, + cp_gathered_logits=full, + attention_mask=attention_mask, + ) + + if rank == 0: + mask = attention_mask.to(torch.bool) + base_s = base.float() + full_s = full.float() + torch.testing.assert_close(full_s[mask], base_s[mask], rtol=3e-2, atol=3e-2) + finally: + try: + unapply_ulysses_patch() + except Exception: + pass + dist.destroy_process_group() + + +@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed is not available") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA + FlashAttention2") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires >= 2 CUDA devices for CP all-to-all") +@pytest.mark.parametrize( + "env_key", + [ + "ROLL_TEST_QWEN25VL_PATH", + "ROLL_TEST_QWEN3VL_PATH", + ], +) +def test_fsdp2_cp_vlm_rmpad_equivalence(env_key: str): + model_path = os.environ.get(env_key) + if not model_path: + pytest.skip(f"Set {env_key} to a local model path to run this test.") + if not os.path.exists(model_path): + pytest.skip(f"{env_key}={model_path} does not exist on this machine.") + + world_size = 2 + port = _find_free_port() + mp.spawn( + _worker_vlm_cp_equivalence, + args=(world_size, port, model_path), + nprocs=world_size, + join=True, + ) + + +@pytest.mark.skipif(not dist.is_available(), reason="torch.distributed is not available") +@pytest.mark.skipif(not torch.cuda.is_available(), reason="Requires CUDA + FlashAttention2") +@pytest.mark.skipif(torch.cuda.device_count() < 2, reason="Requires >= 2 CUDA devices for CP all-to-all") +@pytest.mark.parametrize( + "env_key", + [ + "ROLL_TEST_QWEN25VL_PATH", + "ROLL_TEST_QWEN3VL_PATH", + ], +) +def test_fsdp2_cp_vlm_nonrmpad_equivalence(env_key: str): + model_path = os.environ.get(env_key) + if not model_path: + pytest.skip(f"Set {env_key} to a local model path to run this test.") + if not os.path.exists(model_path): + pytest.skip(f"{env_key}={model_path} does not exist on this machine.") + + world_size = 2 + port = _find_free_port() + mp.spawn( + _worker_vlm_cp_equivalence_nonrmpad, + args=(world_size, port, model_path), + nprocs=world_size, + join=True, + ) diff --git a/tests/distributed/strategy/grad_norm/run_fsdp2_distributed_test.sh b/tests/distributed/strategy/grad_norm/run_fsdp2_distributed_test.sh new file mode 100644 index 000000000..73f96c37b --- /dev/null +++ b/tests/distributed/strategy/grad_norm/run_fsdp2_distributed_test.sh @@ -0,0 +1,38 @@ +#!/bin/bash + +set -e + +echo "==================================" +echo "FSDP2 Gradient Norm Distributed Test" +echo "==================================" +echo "" + +if ! command -v nvidia-smi &> /dev/null; then + echo "ERROR: nvidia-smi not found. CUDA is required for this test." + exit 1 +fi + +NUM_GPUS=$(nvidia-smi --list-gpus | wc -l) +echo "Found $NUM_GPUS GPUs" + +if [ "$NUM_GPUS" -lt 2 ]; then + echo "ERROR: This test requires at least 2 GPUs, but only $NUM_GPUS found." + exit 1 +fi + +SCRIPT_DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" + +echo "" +echo "Running FSDP2 distributed gradient norm test with 2 GPUs..." 
+echo "" + +torchrun \ + --nproc_per_node=2 \ + --master_port=29500 \ + "${SCRIPT_DIR}/test_fsdp2_grad_norm.py" + +echo "" +echo "==================================" +echo "Test completed successfully!" +echo "==================================" + diff --git a/tests/distributed/strategy/grad_norm/test_fsdp2_grad_norm.py b/tests/distributed/strategy/grad_norm/test_fsdp2_grad_norm.py new file mode 100644 index 000000000..ec54be625 --- /dev/null +++ b/tests/distributed/strategy/grad_norm/test_fsdp2_grad_norm.py @@ -0,0 +1,291 @@ +import os + +import torch +import torch.distributed as dist +import torch.nn as nn +from torch.distributed._composable.fsdp import fully_shard +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import MixedPrecisionPolicy +from torch.distributed.tensor import DTensor +from torch.nn.utils.clip_grad import _get_total_norm + +from roll.platforms import current_platform + + +class SimpleModel(nn.Module): + + def __init__(self, input_size=128, hidden_size=256, output_size=64): + super().__init__() + self.fc1 = nn.Linear(input_size, hidden_size, bias=True) + self.fc2 = nn.Linear(hidden_size, hidden_size, bias=True) + self.fc3 = nn.Linear(hidden_size, output_size, bias=True) + self.relu = nn.ReLU() + + def forward(self, x): + x = self.relu(self.fc1(x)) + x = self.relu(self.fc2(x)) + x = self.fc3(x) + return x + + +def manual_compute_grad_norm(model, norm_type=2.0): + grads = [] + for param in model.parameters(): + if param.grad is not None: + # If it's a DTensor, gather to full tensor first + if isinstance(param.grad, DTensor): + grad = param.grad.full_tensor() + else: + grad = param.grad + grads.append(grad) + + if len(grads) == 0: + return torch.tensor(0.0) + + # Compute total norm + total_norm = torch.norm( + torch.stack([torch.norm(g.detach(), norm_type) for g in grads]), + norm_type, + ) + return total_norm + + +def fsdp2_compute_grad_norm(model, norm_type=2.0): + """ + Compute gradient norm using FSDP2 approach (operating on sharded gradients). 
+ """ + parameters = list(model.parameters()) + grads = [p.grad for p in parameters if p.grad is not None] + + if not grads: + return torch.tensor(0.0, device=current_platform.current_device()) + + total_norm = _get_total_norm( + grads, norm_type, error_if_nonfinite=False, foreach=None + ) + + # Convert DTensor to full tensor to get global norm + if isinstance(total_norm, DTensor): + total_norm = total_norm.full_tensor() + + return total_norm + + +def test_gradient_norm_single_gpu(): + """Test gradient norm computation on a single GPU (no sharding).""" + + if not torch.cuda.is_available(): + print("CUDA not available, skipping test") + return + + device = torch.device("cuda") + + # Create model and data + model = SimpleModel().to(device) + batch_size = 8 + input_data = torch.randn(batch_size, 128, device=device) + target = torch.randn(batch_size, 64, device=device) + + # Forward pass + output = model(input_data) + loss = ((output - target) ** 2).mean() + + # Backward pass + loss.backward() + + # Compute gradient norm manually + manual_norm = manual_compute_grad_norm(model) + + # Compute gradient norm using PyTorch's built-in function + from torch.nn.utils import clip_grad_norm_ + + pytorch_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + + # They should match + print(f"Manual norm: {manual_norm.item():.6f}") + print(f"PyTorch norm: {pytorch_norm.item():.6f}") + + assert torch.allclose( + manual_norm, pytorch_norm, rtol=1e-4, atol=1e-4 + ), f"Manual norm {manual_norm.item()} != PyTorch norm {pytorch_norm.item()}" + + print("✓ Single GPU gradient norm test passed!") + + +def test_gradient_norm_fsdp2_distributed(): + """ + Test gradient norm computation with FSDP2 in a distributed setting. + This test should be run with torchrun or similar launcher. 
+ + Example: + torchrun --nproc_per_node=2 test_fsdp2_grad_norm.py + """ + + if not dist.is_initialized(): + # Initialize distributed if not already done + if not torch.cuda.is_available(): + print("CUDA not available, skipping distributed test") + return + + backend = "nccl" if torch.cuda.is_available() else "gloo" + dist.init_process_group(backend=backend) + + rank = dist.get_rank() + world_size = dist.get_world_size() + device = torch.device( + f"cuda:{rank}" if torch.cuda.is_available() else "cpu" + ) + torch.cuda.set_device(device) + + print(f"[Rank {rank}/{world_size}] Starting FSDP2 gradient norm test") + + # Set seed for reproducibility across ranks + torch.manual_seed(42) + + # Create device mesh for FSDP2 + mesh = init_device_mesh( + "cuda" if torch.cuda.is_available() else "cpu", + (world_size,), + mesh_dim_names=("fsdp",), + ) + + # Create model directly on device (not meta) + model = SimpleModel().to(device) + + # Apply FSDP2 configuration using PyTorch's fully_shard + from torch.distributed._composable.fsdp import fully_shard + + mixed_precision = MixedPrecisionPolicy( + param_dtype=torch.bfloat16, + reduce_dtype=torch.float32, + cast_forward_inputs=True, + ) + + for module in model.modules(): + if isinstance(module, nn.Linear): + fully_shard( + module, + mesh=mesh, + reshard_after_forward=True, + mp_policy=mixed_precision, + ) + + fully_shard( + model, + mesh=mesh, + reshard_after_forward=True, + mp_policy=mixed_precision, + ) + + torch.manual_seed(42 + rank) # Different data per rank + batch_size = 4 + input_data = torch.randn( + batch_size, 128, device=device, dtype=torch.bfloat16 + ) + target = torch.randn( + batch_size, 64, device=device, dtype=torch.bfloat16 + ) + + # Forward pass + output = model(input_data) + loss = ((output - target) ** 2).mean() + + print(f"[Rank {rank}] Loss: {loss.item():.6f}") + + # Backward pass + loss.backward() + + # Compute gradient norm using FSDP2 approach + fsdp2_norm = fsdp2_compute_grad_norm(model) + + print(f"[Rank {rank}] FSDP2 gradient norm: {fsdp2_norm.item():.6f}") + + all_norms = [torch.zeros_like(fsdp2_norm) for _ in range(world_size)] + dist.all_gather(all_norms, fsdp2_norm) + + if rank == 0: + print(f"\n[Rank 0] Gradient norms from all ranks:") + for r, norm in enumerate(all_norms): + print(f" Rank {r}: {norm.item():.6f}") + + for r, norm in enumerate(all_norms): + assert torch.allclose( + norm, all_norms[0], rtol=1e-3, atol=1e-5 + ), f"Rank {r} norm {norm.item()} != Rank 0 norm {all_norms[0].item()}" + + print("\n✓ FSDP2 distributed gradient norm test passed!") + + dist.barrier() + + if rank == 0: + print("\nTest completed successfully!") + + +def test_gradient_norm_consistency(): + if not torch.cuda.is_available(): + print("CUDA not available, skipping test") + return + + device = torch.device("cuda") + + # Create a very simple model for easy verification + class TinyModel(nn.Module): + def __init__(self): + super().__init__() + self.w = nn.Parameter( + torch.tensor([1.0, 2.0, 3.0], device=device) + ) + + model = TinyModel() + + loss = (model.w**2).sum() + loss.backward() + + expected_grad = torch.tensor([2.0, 4.0, 6.0], device=device) + assert torch.allclose( + model.w.grad, expected_grad + ), f"Expected grad {expected_grad}, got {model.w.grad}" + + expected_norm = torch.sqrt(torch.tensor(56.0, device=device)) + + from torch.nn.utils import clip_grad_norm_ + + pytorch_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + + print(f"Expected norm: {expected_norm.item():.6f}") + print(f"PyTorch norm: 
{pytorch_norm.item():.6f}") + + assert torch.allclose( + pytorch_norm, expected_norm, rtol=1e-4, atol=1e-4 + ), f"PyTorch norm {pytorch_norm.item()} != expected {expected_norm.item()}" + + print("✓ Gradient norm consistency test passed!") + + +if __name__ == "__main__": + if "RANK" in os.environ and "WORLD_SIZE" in os.environ: + print( + f"Running in distributed mode (Rank {os.environ['RANK']}/{os.environ['WORLD_SIZE']})" + ) + test_gradient_norm_fsdp2_distributed() + else: + print("Running in single-GPU mode") + print("\n" + "=" * 60) + print("Test 1: Gradient Norm Consistency") + print("=" * 60) + test_gradient_norm_consistency() + + print("\n" + "=" * 60) + print("Test 2: Single GPU Gradient Norm") + print("=" * 60) + test_gradient_norm_single_gpu() + + print("\n" + "=" * 60) + print("All tests passed!") + print("=" * 60) + print("\nTo test distributed FSDP2, run:") + print(" torchrun --nproc_per_node=2 test_fsdp2_grad_norm.py") diff --git a/tests/distributed/strategy/grad_norm/test_grad_accumulation_scaling.py b/tests/distributed/strategy/grad_norm/test_grad_accumulation_scaling.py new file mode 100644 index 000000000..d67f7d4db --- /dev/null +++ b/tests/distributed/strategy/grad_norm/test_grad_accumulation_scaling.py @@ -0,0 +1,324 @@ +import torch +import torch.nn as nn + + +class SimpleModel(nn.Module): + """Simple model for testing.""" + + def __init__(self): + super().__init__() + self.fc = nn.Linear(10, 5, bias=True) + + def forward(self, x): + return self.fc(x) + + +def test_gradient_accumulation_without_scaling(): + """ + Test gradient accumulation WITHOUT loss scaling. + This demonstrates the problem: gradients scale with accumulation steps. + """ + print("\n" + "=" * 60) + print("Test: Gradient Accumulation WITHOUT Scaling (Incorrect)") + print("=" * 60) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + torch.manual_seed(42) + + # Test with different accumulation steps + for grad_acc_steps in [1, 2, 4]: + model = SimpleModel().to(device) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + + # Create mini-batches + batch_size_per_step = 8 + total_batch_size = batch_size_per_step * grad_acc_steps + + torch.manual_seed(42) + x_full = torch.randn(total_batch_size, 10, device=device) + y_full = torch.randn(total_batch_size, 5, device=device) + + # Accumulate gradients WITHOUT scaling + optimizer.zero_grad() + for i in range(grad_acc_steps): + start_idx = i * batch_size_per_step + end_idx = (i + 1) * batch_size_per_step + x_mini = x_full[start_idx:end_idx] + y_mini = y_full[start_idx:end_idx] + + output = model(x_mini) + loss = ((output - y_mini) ** 2).mean() + # NO SCALING - This is the problem! + loss.backward() + + # Compute gradient norm + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + + print(f"grad_acc_steps={grad_acc_steps}: grad_norm={grad_norm:.6f}") + + print( + "\n⚠️ WITHOUT scaling, gradient norm increases with accumulation steps!" + ) + + +def test_gradient_accumulation_with_scaling(): + """ + Test gradient accumulation WITH loss scaling. + This demonstrates the correct approach: gradients remain consistent. 
+ """ + print("\n" + "=" * 60) + print("Test: Gradient Accumulation WITH Scaling (Correct)") + print("=" * 60) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Use FIXED total batch size across all tests + total_batch_size = 32 + + # Test with different accumulation steps + grad_norms = {} + for grad_acc_steps in [1, 2, 4, 8]: + torch.manual_seed(42) + model = SimpleModel().to(device) + optimizer = torch.optim.SGD(model.parameters(), lr=0.01) + + # Adjust batch size per step to keep total constant + batch_size_per_step = total_batch_size // grad_acc_steps + + # Use SAME data for all configurations + torch.manual_seed(100) + x_full = torch.randn(total_batch_size, 10, device=device) + y_full = torch.randn(total_batch_size, 5, device=device) + + # Accumulate gradients WITH scaling + optimizer.zero_grad() + for i in range(grad_acc_steps): + start_idx = i * batch_size_per_step + end_idx = (i + 1) * batch_size_per_step + x_mini = x_full[start_idx:end_idx] + y_mini = y_full[start_idx:end_idx] + + output = model(x_mini) + loss = ((output - y_mini) ** 2).mean() + # CORRECT: Scale by gradient accumulation steps + scaled_loss = loss / grad_acc_steps + scaled_loss.backward() + + # Compute gradient norm + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + grad_norms[grad_acc_steps] = grad_norm.item() + + print(f"grad_acc_steps={grad_acc_steps}: grad_norm={grad_norm:.6f}") + + # Verify all gradient norms are similar + norm_values = list(grad_norms.values()) + max_norm = max(norm_values) + min_norm = min(norm_values) + relative_diff = (max_norm - min_norm) / min_norm + + print(f"\nRelative difference: {relative_diff*100:.2f}%") + + if relative_diff < 0.01: # Within 1% + print("✓ WITH scaling, gradient norms remain consistent!") + else: + print(f"⚠️ Gradient norms vary by {relative_diff*100:.2f}%") + print( + " Note: Small variations are expected due to different computational order" + ) + + return relative_diff < 0.05 # Allow 5% for numerical precision + + +def test_gradient_accumulation_equivalence(): + """ + Test that gradient accumulation with scaling is equivalent to full-batch training. 
+ """ + print("\n" + "=" * 60) + print("Test: Gradient Accumulation Equivalence") + print("=" * 60) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + # Full batch training (baseline) + torch.manual_seed(42) + model_full = SimpleModel().to(device) + + total_batch_size = 32 + torch.manual_seed(100) + x_full = torch.randn(total_batch_size, 10, device=device) + y_full = torch.randn(total_batch_size, 5, device=device) + + output_full = model_full(x_full) + loss_full = ((output_full - y_full) ** 2).mean() + loss_full.backward() + + from torch.nn.utils import clip_grad_norm_ + + grad_norm_full = clip_grad_norm_( + model_full.parameters(), max_norm=float("inf") + ) + + print( + f"Full batch (batch_size={total_batch_size}): grad_norm={grad_norm_full:.6f}" + ) + + # Gradient accumulation (should match) + grad_acc_steps = 4 + batch_size_per_step = total_batch_size // grad_acc_steps + + torch.manual_seed(42) + model_acc = SimpleModel().to(device) + model_acc.zero_grad() + + torch.manual_seed(100) + x_acc = torch.randn(total_batch_size, 10, device=device) + y_acc = torch.randn(total_batch_size, 5, device=device) + + for i in range(grad_acc_steps): + start_idx = i * batch_size_per_step + end_idx = (i + 1) * batch_size_per_step + x_mini = x_acc[start_idx:end_idx] + y_mini = y_acc[start_idx:end_idx] + + output = model_acc(x_mini) + loss = ((output - y_mini) ** 2).mean() + scaled_loss = loss / grad_acc_steps + scaled_loss.backward() + + grad_norm_acc = clip_grad_norm_( + model_acc.parameters(), max_norm=float("inf") + ) + + print( + f"Gradient accumulation (steps={grad_acc_steps}, batch_size={batch_size_per_step}): grad_norm={grad_norm_acc:.6f}" + ) + + # Compare + relative_diff = abs(grad_norm_full - grad_norm_acc) / grad_norm_full + print(f"\nRelative difference: {relative_diff*100:.2f}%") + + # They should be very close (within numerical precision) + if torch.allclose(grad_norm_full, grad_norm_acc, rtol=1e-3, atol=1e-5): + print("✓ Gradient accumulation matches full-batch training!") + return True + else: + print(f"⚠️ Mismatch: {grad_norm_full:.6f} vs {grad_norm_acc:.6f}") + return False + + +def test_gradient_accumulation_impact_on_norm(): + """ + Demonstrate the impact of gradient accumulation on gradient norms. 
+ """ + print("\n" + "=" * 60) + print("Summary: Impact of Gradient Accumulation on Gradient Norms") + print("=" * 60) + + device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + + print("\nScenario: Training with gradient_accumulation_steps=4") + print("-" * 60) + + grad_acc_steps = 4 + batch_size_per_step = 8 + total_batch_size = batch_size_per_step * grad_acc_steps + + torch.manual_seed(42) + x = torch.randn(total_batch_size, 10, device=device) + y = torch.randn(total_batch_size, 5, device=device) + + # WITHOUT scaling + torch.manual_seed(42) + model_no_scale = SimpleModel().to(device) + model_no_scale.zero_grad() + + for i in range(grad_acc_steps): + start = i * batch_size_per_step + end = (i + 1) * batch_size_per_step + loss = ((model_no_scale(x[start:end]) - y[start:end]) ** 2).mean() + loss.backward() + + from torch.nn.utils import clip_grad_norm_ + + norm_no_scale = clip_grad_norm_( + model_no_scale.parameters(), max_norm=float("inf") + ) + + # WITH scaling + torch.manual_seed(42) + model_with_scale = SimpleModel().to(device) + model_with_scale.zero_grad() + + for i in range(grad_acc_steps): + start = i * batch_size_per_step + end = (i + 1) * batch_size_per_step + loss = ((model_with_scale(x[start:end]) - y[start:end]) ** 2).mean() + (loss / grad_acc_steps).backward() + + norm_with_scale = clip_grad_norm_( + model_with_scale.parameters(), max_norm=float("inf") + ) + + print(f"WITHOUT loss scaling: grad_norm = {norm_no_scale:.6f}") + print(f"WITH loss scaling: grad_norm = {norm_with_scale:.6f}") + print(f"\nRatio (without/with): {norm_no_scale / norm_with_scale:.2f}x") + print(f"Expected ratio: {grad_acc_steps:.2f}x") + + # The ratio should match the gradient accumulation steps + ratio = norm_no_scale / norm_with_scale + expected_ratio = float(grad_acc_steps) + + if abs(ratio - expected_ratio) < 0.1: + print( + f"\n✓ Without scaling, gradients are {grad_acc_steps}x larger!" + ) + + return abs(ratio - expected_ratio) < 0.1 + + +if __name__ == "__main__": + print("\n" + "=" * 80) + print("GRADIENT ACCUMULATION SCALING TESTS") + print("=" * 80) + + # Run all tests + test_gradient_accumulation_without_scaling() + + test1_passed = test_gradient_accumulation_with_scaling() + test2_passed = test_gradient_accumulation_equivalence() + test3_passed = test_gradient_accumulation_impact_on_norm() + + print("\n" + "=" * 80) + print("SUMMARY") + print("=" * 80) + print( + f"Gradient accumulation consistency: {'✓ PASS' if test1_passed else '✗ FAIL'}" + ) + print( + f"Full-batch equivalence: {'✓ PASS' if test2_passed else '✗ FAIL'}" + ) + print( + f"Scaling impact verification: {'✓ PASS' if test3_passed else '✗ FAIL'}" + ) + + if test1_passed and test2_passed and test3_passed: + print("\n✓ All tests passed!") + print("\nKEY TAKEAWAY:") + print( + " Always scale loss by 1/gradient_accumulation_steps to maintain" + ) + print( + " consistent gradient magnitudes regardless of accumulation settings." 
+ ) + else: + print("\n✗ Some tests failed") + + print("=" * 80) diff --git a/tests/distributed/strategy/grad_norm/test_grad_norm_unit.py b/tests/distributed/strategy/grad_norm/test_grad_norm_unit.py new file mode 100644 index 000000000..cd6c20f47 --- /dev/null +++ b/tests/distributed/strategy/grad_norm/test_grad_norm_unit.py @@ -0,0 +1,222 @@ +import pytest +import torch +import torch.nn as nn + + +class TestGradientNormBasic: + """Basic unit tests for gradient norm computation.""" + + def test_simple_parameter_grad_norm(self): + """Test gradient norm with a single parameter.""" + # Create a parameter with known gradient + param = nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + + # Manually set gradient: [2, 4, 6] + param.grad = torch.tensor([2.0, 4.0, 6.0]) + + # Expected L2 norm: sqrt(4 + 16 + 36) = sqrt(56) ≈ 7.4833 + expected_norm = torch.sqrt(torch.tensor(56.0)) + + # Compute using PyTorch + from torch.nn.utils import clip_grad_norm_ + + computed_norm = clip_grad_norm_([param], max_norm=float("inf")) + + assert torch.allclose( + computed_norm, expected_norm, rtol=1e-5, atol=1e-5 + ), f"Computed norm {computed_norm:.6f} != expected {expected_norm:.6f}" + + def test_multiple_parameters_grad_norm(self): + """Test gradient norm with multiple parameters.""" + # Create parameters + param1 = nn.Parameter( + torch.tensor([3.0, 4.0]) + ) # grad will be [1, 0] + param2 = nn.Parameter( + torch.tensor([1.0, 2.0]) + ) # grad will be [0, 1] + + param1.grad = torch.tensor([1.0, 0.0]) + param2.grad = torch.tensor([0.0, 1.0]) + + # Expected L2 norm: sqrt(1^2 + 0^2 + 0^2 + 1^2) = sqrt(2) ≈ 1.4142 + expected_norm = torch.sqrt(torch.tensor(2.0)) + + from torch.nn.utils import clip_grad_norm_ + + computed_norm = clip_grad_norm_( + [param1, param2], max_norm=float("inf") + ) + + assert torch.allclose( + computed_norm, expected_norm, rtol=1e-5, atol=1e-5 + ), f"Computed norm {computed_norm:.6f} != expected {expected_norm:.6f}" + + def test_model_grad_norm(self): + """Test gradient norm computation through a simple model.""" + + class TinyModel(nn.Module): + def __init__(self): + super().__init__() + self.w1 = nn.Parameter(torch.tensor([1.0, 2.0])) + self.w2 = nn.Parameter(torch.tensor([3.0])) + + model = TinyModel() + + # Create a simple loss: L = w1[0]^2 + w1[1]^2 + w2[0]^2 + # Gradients: dL/dw1 = [2*w1[0], 2*w1[1]] = [2, 4] + # dL/dw2 = [2*w2[0]] = [6] + loss = (model.w1**2).sum() + (model.w2**2).sum() + loss.backward() + + # Verify gradients + assert torch.allclose( + model.w1.grad, torch.tensor([2.0, 4.0]) + ), f"w1.grad = {model.w1.grad}, expected [2, 4]" + assert torch.allclose( + model.w2.grad, torch.tensor([6.0]) + ), f"w2.grad = {model.w2.grad}, expected [6]" + + # Expected norm: sqrt(4 + 16 + 36) = sqrt(56) + expected_norm = torch.sqrt(torch.tensor(56.0)) + + from torch.nn.utils import clip_grad_norm_ + + computed_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + + assert torch.allclose( + computed_norm, expected_norm, rtol=1e-5, atol=1e-5 + ), f"Computed norm {computed_norm:.6f} != expected {expected_norm:.6f}" + + def test_grad_clipping(self): + """Test that gradient clipping works correctly.""" + + # Create parameter with large gradient + param = nn.Parameter(torch.tensor([3.0, 4.0])) + param.grad = torch.tensor([3.0, 4.0]) # norm = 5.0 + + max_norm = 2.5 + from torch.nn.utils import clip_grad_norm_ + + total_norm = clip_grad_norm_([param], max_norm=max_norm) + + # Total norm before clipping should be 5.0 + assert torch.allclose( + total_norm, torch.tensor(5.0), rtol=1e-5 
+ ), f"Total norm {total_norm:.6f} != 5.0" + + # After clipping, gradient should be scaled by max_norm / total_norm = 2.5 / 5.0 = 0.5 + expected_grad = torch.tensor([1.5, 2.0]) # [3, 4] * 0.5 + assert torch.allclose( + param.grad, expected_grad, rtol=1e-5, atol=1e-5 + ), f"Clipped grad {param.grad} != expected {expected_grad}" + + # Verify clipped norm + clipped_norm = torch.norm(param.grad) + assert torch.allclose( + clipped_norm, torch.tensor(max_norm), rtol=1e-5, atol=1e-5 + ), f"Clipped norm {clipped_norm:.6f} != max_norm {max_norm}" + + @pytest.mark.skipif( + not torch.cuda.is_available(), reason="CUDA not available" + ) + def test_grad_norm_cuda(self): + """Test gradient norm computation on CUDA.""" + + device = torch.device("cuda") + + class SimpleModel(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(10, 5, bias=True) + + model = SimpleModel().to(device) + + # Forward pass + x = torch.randn(8, 10, device=device) + y_target = torch.randn(8, 5, device=device) + y_pred = model(x) + loss = ((y_pred - y_target) ** 2).mean() + + # Backward pass + loss.backward() + + # Compute gradient norm + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_( + model.parameters(), max_norm=float("inf") + ) + + # Verify it's a valid number + assert torch.isfinite( + grad_norm + ), f"Gradient norm is not finite: {grad_norm}" + assert ( + grad_norm > 0 + ), f"Gradient norm should be positive, got {grad_norm}" + + # Manual computation + total_norm_sq = 0.0 + for param in model.parameters(): + if param.grad is not None: + param_norm = torch.norm(param.grad) + total_norm_sq += param_norm**2 + manual_norm = torch.sqrt(total_norm_sq) + + assert torch.allclose( + grad_norm, manual_norm, rtol=1e-4, atol=1e-5 + ), f"Computed norm {grad_norm:.6f} != manual norm {manual_norm:.6f}" + + +class TestGradientNormEdgeCases: + """Test edge cases in gradient norm computation.""" + + def test_zero_gradients(self): + """Test gradient norm with zero gradients.""" + param = nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + param.grad = torch.zeros_like(param) + + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_([param], max_norm=1.0) + + assert torch.allclose( + grad_norm, torch.tensor(0.0) + ), f"Zero gradient should have norm 0, got {grad_norm}" + + def test_no_gradients(self): + """Test gradient norm when no parameters have gradients.""" + param = nn.Parameter(torch.tensor([1.0, 2.0, 3.0])) + # Don't set grad (None) + + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_([param], max_norm=1.0) + + assert torch.allclose( + grad_norm, torch.tensor(0.0) + ), f"No gradient should have norm 0, got {grad_norm}" + + def test_mixed_gradients(self): + """Test gradient norm when some parameters have gradients and others don't.""" + param1 = nn.Parameter(torch.tensor([3.0, 4.0])) + param2 = nn.Parameter(torch.tensor([1.0, 2.0])) + + param1.grad = torch.tensor([3.0, 4.0]) # norm = 5 + # param2.grad is None + + from torch.nn.utils import clip_grad_norm_ + + grad_norm = clip_grad_norm_([param1, param2], max_norm=float("inf")) + + expected_norm = torch.tensor(5.0) + assert torch.allclose( + grad_norm, expected_norm, rtol=1e-5 + ), f"Computed norm {grad_norm:.6f} != expected {expected_norm:.6f}" + + +if __name__ == "__main__": + pytest.main([__file__, "-v", "-s"]) diff --git a/tests/distributed/strategy/log_probs/analyze_layer_divergence.py b/tests/distributed/strategy/log_probs/analyze_layer_divergence.py new file mode 100644 index 
000000000..7ca932781 --- /dev/null +++ b/tests/distributed/strategy/log_probs/analyze_layer_divergence.py @@ -0,0 +1,612 @@ +import argparse +import json +from pathlib import Path +from typing import Dict, List, Optional, Tuple + +import numpy as np +import torch +from tqdm import tqdm + + +def load_layer_states( + state_dir: Path, prefix: str, global_step: int, batch_idx: int = 0, subdir: str = "layers" +) -> Dict: + """Load all layer states for a given step and batch.""" + layer_states = {} + + # Look in subdirectory (layers or embeddings) + search_dir = state_dir / subdir if subdir else state_dir + + if not search_dir.exists(): + return layer_states + + # Find all files matching the pattern + pattern = f"{prefix}_step{global_step}_batch{batch_idx}_*.pt" + state_files = list(search_dir.glob(pattern)) + + for state_file in state_files: + # Parse filename patterns: + # - {prefix}_step{step}_batch{batch}_layer_states_{layer_key}_{state_key}.pt + # - {prefix}_step{step}_batch{batch}_{direct_key}.pt (e.g., inputs_embeds) + stem = state_file.stem + prefix_pattern = f"{prefix}_step{global_step}_batch{batch_idx}_" + + if not stem.startswith(prefix_pattern): + continue + + # Remove prefix to get the key part + key_part = stem[len(prefix_pattern) :] + + # Check if it's a layer_states file + # Pattern: layer_states_layer_{N}_{state_key} + # Example: layer_states_layer_0_before_attn + if key_part.startswith("layer_states_"): + parts = key_part.split("_") + # parts = ['layer', 'states', 'layer', '0', 'before', 'attn', ...] + if len(parts) >= 4 and parts[0] == "layer" and parts[1] == "states": + # parts[2] = "layer", parts[3] = layer number + layer_key = f"{parts[2]}_{parts[3]}" # e.g., "layer_0" + if len(parts) > 4: + state_key = "_".join(parts[4:]) # e.g., "before_attn" + else: + state_key = "hidden_state" + + if layer_key not in layer_states: + layer_states[layer_key] = {} + layer_states[layer_key][state_key] = torch.load(state_file) + else: + # Direct key (e.g., inputs_embeds, visual_image_embeds) + layer_states[key_part] = torch.load(state_file) + + return layer_states + + +def compute_tensor_diff(tensor1: torch.Tensor, tensor2: torch.Tensor, mask: Optional[torch.Tensor] = None) -> Dict: + """Compute various difference metrics between two tensors.""" + transposed = None + if tensor1.shape != tensor2.shape: + # Common layout mismatch between frameworks: + # - Megatron often uses [S, B, H] + # - HF/FSDP often uses [B, S, H] + # Try swapping the first two dims for 2D/3D tensors. 
+ if tensor1.dim() in (2, 3) and tensor2.dim() == tensor1.dim(): + if tensor1.transpose(0, 1).shape == tensor2.shape: + tensor1 = tensor1.transpose(0, 1).contiguous() + transposed = "tensor1" + if mask is not None and mask.dim() >= 2 and mask.shape == tensor2.transpose(0, 1).shape: + mask = mask.transpose(0, 1).contiguous() + elif tensor2.transpose(0, 1).shape == tensor1.shape: + tensor2 = tensor2.transpose(0, 1).contiguous() + transposed = "tensor2" + if mask is not None and mask.dim() >= 2 and mask.shape == tensor2.transpose(0, 1).shape: + mask = mask.transpose(0, 1).contiguous() + + if tensor1.shape != tensor2.shape: + return { + "shape_mismatch": True, + "shape1": list(tensor1.shape), + "shape2": list(tensor2.shape), + } + + # Handle bool tensors (e.g., visual_pos_masks) + if tensor1.dtype == torch.bool or tensor2.dtype == torch.bool: + if tensor1.dtype != tensor2.dtype: + return { + "dtype_mismatch": True, + "dtype1": str(tensor1.dtype), + "dtype2": str(tensor2.dtype), + } + # For bool tensors, compute element-wise equality + equal = tensor1 == tensor2 + if mask is not None: + if mask.shape != tensor1.shape: + mask = mask.expand_as(tensor1) + equal = equal | (~mask) # Consider masked positions as equal + num_different = (~equal).sum().item() + total = equal.numel() + return { + "is_bool": True, + "num_different": num_different, + "total": total, + "match_rate": (total - num_different) / total if total > 0 else 1.0, + } + + diff = tensor1 - tensor2 + abs_diff = diff.abs() + + if mask is not None: + if mask.shape != tensor1.shape: + # Try to broadcast mask + mask = mask.expand_as(tensor1) + abs_diff_masked = abs_diff * mask + max_diff = abs_diff_masked.max().item() + mean_diff = abs_diff_masked[mask > 0].mean().item() if mask.any() else 0.0 + max_abs_value = torch.max(tensor1.abs(), tensor2.abs())[mask > 0].max().item() if mask.any() else 0.0 + else: + max_diff = abs_diff.max().item() + mean_diff = abs_diff.mean().item() + max_abs_value = torch.max(tensor1.abs(), tensor2.abs()).max().item() + + # Relative error + relative_error = max_diff / (max_abs_value + 1e-10) + + # Cosine similarity + tensor1_flat = tensor1.flatten() + tensor2_flat = tensor2.flatten() + cos_sim = torch.nn.functional.cosine_similarity(tensor1_flat.unsqueeze(0), tensor2_flat.unsqueeze(0)).item() + + return { + "max_diff": max_diff, + "mean_diff": mean_diff, + "relative_error": relative_error, + "cosine_similarity": cos_sim, + "shape_mismatch": False, + "transposed": transposed, + } + + +def compare_layer_states(fsdp_states: Dict, hf_states: Dict, attention_mask: Optional[torch.Tensor] = None) -> Dict: + """Compare layer states between FSDP2 and HF. 
+ + Handles both: + - Nested structure: {layer_0: {before_attn: tensor, ...}, ...} (layer states) + - Flat structure: {inputs_embeds: tensor, ...} (embeddings) + """ + comparison = {} + + # Get all keys (union of both) + all_keys = set(fsdp_states.keys()) | set(hf_states.keys()) + + for key in sorted(all_keys): + if key not in fsdp_states or key not in hf_states: + comparison[key] = {"missing": True} + continue + + fsdp_value = fsdp_states[key] + hf_value = hf_states[key] + + # Check if this is a nested structure (layer states) or flat (embeddings) + if isinstance(fsdp_value, dict) and isinstance(hf_value, dict): + # Nested structure: layer states + layer_comparison = {} + + # Compare each state within the layer + all_state_keys = set(fsdp_value.keys()) | set(hf_value.keys()) + for state_key in sorted(all_state_keys): + if state_key not in fsdp_value or state_key not in hf_value: + layer_comparison[state_key] = {"missing": True} + continue + + fsdp_tensor = fsdp_value[state_key] + hf_tensor = hf_value[state_key] + + if isinstance(fsdp_tensor, torch.Tensor) and isinstance(hf_tensor, torch.Tensor): + # Skip comparison for visual_pos_masks (they're metadata, just check if they match) + if state_key == "visual_pos_masks": + if fsdp_tensor.shape == hf_tensor.shape and fsdp_tensor.dtype == hf_tensor.dtype: + match = (fsdp_tensor == hf_tensor).all().item() + layer_comparison[state_key] = { + "is_mask": True, + "match": match, + "shape": list(fsdp_tensor.shape), + } + else: + layer_comparison[state_key] = { + "is_mask": True, + "match": False, + "shape_mismatch": True, + "shape1": list(fsdp_tensor.shape), + "shape2": list(hf_tensor.shape), + } + else: + # Create mask for this state if attention_mask is provided + # Note: layer states might have different shapes, so we need to be careful + mask = None + if attention_mask is not None and state_key not in ( + "visual_pos_masks", + "deepstack_visual_embeds", + ): + # Try to create appropriate mask based on tensor shape + if len(fsdp_tensor.shape) >= 2: + # attention_mask is [B, S] + if ( + fsdp_tensor.shape[0] == attention_mask.shape[0] + and fsdp_tensor.shape[1] == attention_mask.shape[1] + ): + # [B, S, ...] + mask = attention_mask + elif ( + fsdp_tensor.shape[0] == attention_mask.shape[1] + and fsdp_tensor.shape[1] == attention_mask.shape[0] + ): + # [S, B, ...] 
+ mask = attention_mask.transpose(0, 1) + if mask is not None: + mask = mask.unsqueeze(-1) + while mask.dim() < fsdp_tensor.dim(): + mask = mask.unsqueeze(-1) + mask = mask.expand_as(fsdp_tensor) + + diff_stats = compute_tensor_diff(fsdp_tensor, hf_tensor, mask) + layer_comparison[state_key] = diff_stats + else: + layer_comparison[state_key] = {"type_mismatch": True} + + comparison[key] = layer_comparison + elif isinstance(fsdp_value, torch.Tensor) and isinstance(hf_value, torch.Tensor): + # Flat structure: direct tensor comparison (embeddings) + # Skip bool tensors (masks) + if fsdp_value.dtype == torch.bool or hf_value.dtype == torch.bool: + if fsdp_value.shape == hf_value.shape and fsdp_value.dtype == hf_value.dtype: + match = (fsdp_value == hf_value).all().item() + comparison[key] = { + "is_mask": True, + "match": match, + "shape": list(fsdp_value.shape), + } + else: + comparison[key] = { + "is_mask": True, + "match": False, + "shape_mismatch": True, + "shape1": list(fsdp_value.shape), + "shape2": list(hf_value.shape), + } + else: + # Create mask if attention_mask is provided + mask = None + if attention_mask is not None: + if len(fsdp_value.shape) >= 2: + if ( + fsdp_value.shape[0] == attention_mask.shape[0] + and fsdp_value.shape[1] == attention_mask.shape[1] + ): + mask = attention_mask + elif ( + fsdp_value.shape[0] == attention_mask.shape[1] + and fsdp_value.shape[1] == attention_mask.shape[0] + ): + mask = attention_mask.transpose(0, 1) + if mask is not None: + mask = mask.unsqueeze(-1) + while mask.dim() < fsdp_value.dim(): + mask = mask.unsqueeze(-1) + mask = mask.expand_as(fsdp_value) + + diff_stats = compute_tensor_diff(fsdp_value, hf_value, mask) + comparison[key] = diff_stats + else: + comparison[key] = {"type_mismatch": True} + + return comparison + + +def find_divergence_point(comparison: Dict, threshold: float = 1e-5) -> Optional[int]: + """Find the first point where divergence exceeds threshold. + + Supports both: + - Nested layer structure: {layer_0: {before_attn: {max_diff: ...}, ...}, ...} + - Flat tensor structure: {inputs_embeds: {max_diff: ...}, ...} + (e.g., if some tensors were saved directly under `layers/` without the `layer_states_` prefix) + """ + for layer_idx, (layer_key, layer_comp) in enumerate(sorted(comparison.items())): + # Defensive: some keys may map to non-dicts in malformed/partial outputs. + if not isinstance(layer_comp, dict): + continue + + # Flat diff-stats dict (max_diff/mean_diff/...) 
at top level + if "max_diff" in layer_comp and isinstance(layer_comp.get("max_diff", None), (int, float)): + if layer_comp.get("max_diff", 0) > threshold: + return layer_idx, layer_key, "__tensor__" + continue + + # Nested layer dict case + if "missing" in layer_comp: + continue + + for state_key, state_comp in layer_comp.items(): + if not isinstance(state_comp, dict): + continue + if "missing" in state_comp or "type_mismatch" in state_comp: + continue + + if state_comp.get("max_diff", 0) > threshold: + return layer_idx, layer_key, state_key + + return None + + +def analyze_divergence( + fsdp_dir: Path, + hf_dir: Path, + inputs_dir: Path, + output_file: Path, + fsdp_prefix: str = "fsdp2", + hf_prefix: str = "hf", + fsdp_name: str = "FSDP2", + hf_name: str = "HF", + global_step: int = 0, + batch_idx: int = 0, + threshold: float = 1e-5, +): + """Main analysis function.""" + print(f"Analyzing divergence for step {global_step}, batch {batch_idx}") + + # Load embeddings first + print("Loading embeddings...") + fsdp_embeddings = load_layer_states(fsdp_dir, fsdp_prefix, global_step, batch_idx, subdir="embeddings") + hf_embeddings = load_layer_states(hf_dir, hf_prefix, global_step, batch_idx, subdir="embeddings") + print(f"{fsdp_name} embeddings: {list(fsdp_embeddings.keys())}") + print(f"{hf_name} embeddings: {list(hf_embeddings.keys())}") + + # Load layer states + print(f"Loading {fsdp_name} layer states...") + fsdp_states = load_layer_states(fsdp_dir, fsdp_prefix, global_step, batch_idx, subdir="layers") + print(f"Loaded {len(fsdp_states)} layers from {fsdp_name}") + + print(f"Loading {hf_name} layer states...") + hf_states = load_layer_states(hf_dir, hf_prefix, global_step, batch_idx, subdir="layers") + print(f"Loaded {len(hf_states)} layers from {hf_name}") + + # Load attention mask if available + attention_mask = None + mask_file = inputs_dir / f"input_step{global_step}_batch{batch_idx}_attention_mask.pt" + if mask_file.exists(): + attention_mask = torch.load(mask_file) + print(f"Loaded attention mask: {attention_mask.shape}") + + # Compare embeddings first + print("Comparing embeddings...") + embedding_comparison = compare_layer_states(fsdp_embeddings, hf_embeddings, attention_mask) + + # Compare states + print("Comparing layer states...") + comparison = compare_layer_states(fsdp_states, hf_states, attention_mask) + + # Find divergence point + divergence_point = find_divergence_point(comparison, threshold) + + # Generate summary + summary = { + "global_step": global_step, + "batch_idx": batch_idx, + "fsdp_prefix": fsdp_prefix, + "hf_prefix": hf_prefix, + "fsdp_name": fsdp_name, + "hf_name": hf_name, + "num_fsdp_layers": len(fsdp_states), + "num_hf_layers": len(hf_states), + "divergence_threshold": threshold, + "divergence_point": divergence_point, + "embedding_comparison": embedding_comparison, + "layer_comparison": comparison, + } + + # Add per-layer summary + layer_summaries = [] + for layer_key, layer_comp in sorted(comparison.items()): + if not isinstance(layer_comp, dict): + continue + + # Flat diff-stats dict at top level (treat as a "layer" summary entry too) + if "max_diff" in layer_comp and isinstance(layer_comp.get("max_diff", None), (int, float)): + layer_summaries.append( + { + "layer": layer_key, + "max_diff": float(layer_comp.get("max_diff", 0.0)), + "mean_diff": float(layer_comp.get("mean_diff", 0.0)), + "max_relative_error": float(layer_comp.get("relative_error", 0.0)), + "min_cosine_similarity": float(layer_comp.get("cosine_similarity", 1.0)), + } + ) + continue + + if 
"missing" in layer_comp: + continue + + layer_max_diff = 0.0 + layer_mean_diff = 0.0 + layer_max_relative_error = 0.0 + layer_min_cosine_sim = 1.0 + + for state_key, state_comp in layer_comp.items(): + if not isinstance(state_comp, dict): + continue + if "missing" in state_comp or "type_mismatch" in state_comp: + continue + + layer_max_diff = max(layer_max_diff, state_comp.get("max_diff", 0)) + layer_mean_diff = max(layer_mean_diff, state_comp.get("mean_diff", 0)) + layer_max_relative_error = max(layer_max_relative_error, state_comp.get("relative_error", 0)) + layer_min_cosine_sim = min(layer_min_cosine_sim, state_comp.get("cosine_similarity", 1.0)) + + layer_summaries.append( + { + "layer": layer_key, + "max_diff": layer_max_diff, + "mean_diff": layer_mean_diff, + "max_relative_error": layer_max_relative_error, + "min_cosine_similarity": layer_min_cosine_sim, + } + ) + + summary["layer_summaries"] = layer_summaries + + # Save results + with open(output_file, "w") as f: + json.dump(summary, f, indent=2, default=str) + + print(f"\nAnalysis complete. Results saved to {output_file}") + + # Analyze embedding divergence + print("\n" + "=" * 80) + print("EMBEDDING ANALYSIS") + print("=" * 80) + if embedding_comparison: + print("\nEmbedding differences:") + max_emb_diff = 0.0 + max_emb_rel_err = 0.0 + for emb_key, emb_stats in sorted(embedding_comparison.items()): + if "is_mask" in emb_stats: + print(f" {emb_key}: ✓ Match (metadata)") + continue + if "shape_mismatch" in emb_stats and emb_stats["shape_mismatch"]: + print(f" {emb_key}: ✗ SHAPE MISMATCH") + continue + max_diff = emb_stats.get("max_diff", 0) + rel_err = emb_stats.get("relative_error", 0) + cos_sim = emb_stats.get("cosine_similarity", 1.0) + max_emb_diff = max(max_emb_diff, max_diff) + max_emb_rel_err = max(max_emb_rel_err, rel_err) + + # Determine severity + severity = "" + if max_diff > 0.01 or rel_err > 0.01: + severity = " ⚠️ HIGH" + elif max_diff > 0.001 or rel_err > 0.001: + severity = " ⚠️ MEDIUM" + + print( + f" {emb_key}: max_diff={max_diff:.6f}, " + f"rel_error={rel_err:.6f}, " + f"cos_sim={cos_sim:.6f}{severity}" + ) + + print(f"\nEmbedding Summary:") + print(f" Max absolute difference: {max_emb_diff:.6f}") + print(f" Max relative error: {max_emb_rel_err:.6f}") + + if max_emb_diff > 0.01 or max_emb_rel_err > 0.01: + print(f"\n ⚠️ WARNING: Significant divergence detected at EMBEDDING phase!") + print(f" This is likely the ROOT CAUSE of logprobs differences.") + print(f" Possible causes:") + print(f" 1. Different input_ids or tokenization") + print(f" 2. Different visual encoder outputs (vision model differences)") + print(f" 3. Different embedding layer weights (model loading/initialization)") + print(f" 4. 
Numerical precision differences in embedding computation") + print(f" → Check if input_ids are identical between FSDP2 and HF") + print(f" → Check if pixel_values are processed identically") + elif max_emb_diff > 0.001: + print(f"\n ⚠️ Moderate differences at embedding phase") + print(f" These may accumulate through layers") + else: + print(f"\n ✓ Embeddings are very similar (differences likely numerical precision)") + + print("\n" + "=" * 80) + print("LAYER-BY-LAYER ANALYSIS") + print("=" * 80) + print(f"\nDivergence point (threshold={threshold}): {divergence_point}") + print("\nLayer summaries (showing divergence progression):") + for i, layer_summary in enumerate(layer_summaries[:20]): # Show first 20 layers + layer_name = layer_summary["layer"] + max_diff = layer_summary["max_diff"] + rel_err = layer_summary["max_relative_error"] + cos_sim = layer_summary["min_cosine_similarity"] + + # Mark significant divergence + marker = "" + if max_diff > threshold: + marker = " ⚠️ DIVERGED" + elif max_diff > threshold / 10: + marker = " ⚠️ WARNING" + + print( + f" [{i:2d}] {layer_name}: max_diff={max_diff:.6e}, " + f"rel_error={rel_err:.6e}, " + f"cos_sim={cos_sim:.6f}{marker}" + ) + + if len(layer_summaries) > 20: + print(f" ... 
({len(layer_summaries) - 20} more layers)") + + return summary + + +def main(): + parser = argparse.ArgumentParser(description="Analyze layer state divergence between FSDP2 and HF") + parser.add_argument("--fsdp-dir", type=str, required=True, help="Directory containing FSDP2 layer states") + parser.add_argument("--hf-dir", type=str, required=True, help="Directory containing HF layer states") + parser.add_argument("--inputs-dir", type=str, required=True, help="Directory containing input tensors") + parser.add_argument("--output", type=str, default="divergence_analysis.json", help="Output JSON file") + parser.add_argument("--step", type=int, default=0, help="Global step to analyze") + parser.add_argument("--batch", type=int, default=0, help="Batch index to analyze") + parser.add_argument("--threshold", type=float, default=1e-5, help="Divergence threshold") + parser.add_argument("--fsdp-prefix", type=str, default="fsdp2", help="Prefix used for FSDP2 saved tensors") + parser.add_argument( + "--hf-prefix", type=str, default="hf", help="Prefix used for baseline saved tensors (hf/megatron)" + ) + parser.add_argument("--fsdp-name", type=str, default="FSDP2", help="Display name for FSDP side") + parser.add_argument("--hf-name", type=str, default="HF", help="Display name for baseline side") + + args = parser.parse_args() + + analyze_divergence( + fsdp_dir=Path(args.fsdp_dir), + hf_dir=Path(args.hf_dir), + inputs_dir=Path(args.inputs_dir), + output_file=Path(args.output), + fsdp_prefix=args.fsdp_prefix, + hf_prefix=args.hf_prefix, + fsdp_name=args.fsdp_name, + hf_name=args.hf_name, + global_step=args.step, + batch_idx=args.batch, + threshold=args.threshold, + ) + + +if __name__ == "__main__": + main() diff --git a/tests/distributed/strategy/log_probs/apply_model_patch.py b/tests/distributed/strategy/log_probs/apply_model_patch.py new file mode 100644 index 000000000..44c5dc5f8 --- /dev/null +++ b/tests/distributed/strategy/log_probs/apply_model_patch.py @@ -0,0 +1,744 @@ +import os + +import torch +import torch.nn.functional as F +from numpy import save + +# Try to import the capture utilities +try: + from tests.distributed.strategy.log_probs.layer_states_capture import is_enabled, save_dict, save_tensor +except ImportError: + # If not available, create no-op functions + def is_enabled(): + return False + + def save_tensor(tensor, name, subdir=""): + pass + + def save_dict(data, name, subdir=""): + pass + + +def apply_qwen3vl_patches(): + """Apply patches to Qwen3VL model classes.""" + try: + from transformers.models.qwen3_vl.modeling_qwen3_vl import ( + ALL_ATTENTION_FUNCTIONS, + Callable, + CustomBaseModelOutputWithPast, + DynamicCache, + Qwen3VLModel, + Qwen3VLTextDecoderLayer, + Qwen3VLTextMLP, + Qwen3VLTextModel, + Qwen3VLVisionAttention, + Qwen3VLVisionBlock, + Qwen3VLVisionModel, + apply_rotary_pos_emb_vision, + create_causal_mask, + eager_attention_forward, + ) + + # Patch Qwen3VLTextModel.forward + original_text_model_forward = Qwen3VLTextModel.forward + + def patched_text_model_forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + use_cache=None, + cache_position=None, + visual_pos_masks=None, + deepstack_visual_embeds=None, + **kwargs, + ): + # Capture inputs_embeds + if inputs_embeds is not None and is_enabled(): + save_tensor(inputs_embeds, "inputs_embeds", subdir="embeddings") + + # Capture visual embeddings + if deepstack_visual_embeds is not None and is_enabled(): + for i, visual_embed in 
enumerate(deepstack_visual_embeds): + save_tensor(visual_embed, f"deepstack_visual_embeds_{i}", subdir="embeddings") + + if visual_pos_masks is not None and is_enabled(): + save_tensor(visual_pos_masks, "visual_pos_masks", subdir="embeddings") + + # Call original forward + if is_enabled(): + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError("You must specify exactly one of input_ids or inputs_embeds") + + # torch.jit.trace() doesn't support cache objects in the output + if use_cache and past_key_values is None and not torch.jit.is_tracing(): + past_key_values = DynamicCache(config=self.config) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + if cache_position is None: + past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0 + cache_position = torch.arange( + past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device + ) + + # the hard coded `3` is for temporal, height and width. + if position_ids is None: + position_ids = cache_position.view(1, 1, -1).expand(3, inputs_embeds.shape[0], -1) + elif position_ids.ndim == 2: + position_ids = position_ids[None, ...].expand(3, position_ids.shape[0], -1) + + if position_ids.ndim == 3 and position_ids.shape[0] == 4: + text_position_ids = position_ids[0] + position_ids = position_ids[1:] + else: + text_position_ids = position_ids[0] + + attention_mask = create_causal_mask( + config=self.config, + input_embeds=inputs_embeds, + attention_mask=attention_mask, + cache_position=cache_position, + past_key_values=past_key_values, + position_ids=text_position_ids, + ) + + hidden_states = inputs_embeds + + # create position embeddings to be shared across the decoder layers + position_embeddings = self.rotary_emb(hidden_states, position_ids) + + # decoder layers + layer_states = {} + for layer_idx, decoder_layer in enumerate(self.layers): + layer_outputs, layer_state = decoder_layer( + hidden_states, + attention_mask=attention_mask, + position_ids=text_position_ids, + past_key_values=past_key_values, + cache_position=cache_position, + position_embeddings=position_embeddings, + layer_ids=layer_idx, + **kwargs, + ) + hidden_states = layer_outputs + layer_states[f"layer_{layer_idx}"] = layer_state + layer_states[f"layer_{layer_idx}_visual_pos_masks"] = visual_pos_masks + + # add visual features to the hidden states of first several layers + if deepstack_visual_embeds is not None and layer_idx in range(len(deepstack_visual_embeds)): + layer_states[f"layer_{layer_idx}_deepstack_visual_embeds"] = deepstack_visual_embeds[layer_idx] + hidden_states = self._deepstack_process( + hidden_states, + visual_pos_masks, + deepstack_visual_embeds[layer_idx], + ) + layer_states[f"layer_{layer_idx}_deepstack"] = hidden_states + + hidden_states = self.norm(hidden_states) + + return CustomBaseModelOutputWithPast( + last_hidden_state=hidden_states, + past_key_values=past_key_values, + layer_states=layer_states, + ) + else: + output = original_text_model_forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + cache_position=cache_position, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + **kwargs, + ) + + # Capture layer_states + if hasattr(output, "layer_states") and output.layer_states is not None and is_enabled(): + save_dict(output.layer_states, "layer_states", subdir="layers") + + 
return output + + Qwen3VLTextModel.forward = patched_text_model_forward + + # Patch Qwen3VLModel.forward to capture visual embeddings + original_model_forward = Qwen3VLModel.forward + + def patched_model_forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + past_key_values=None, + inputs_embeds=None, + pixel_values=None, + pixel_values_videos=None, + image_grid_thw=None, + video_grid_thw=None, + cache_position=None, + **kwargs, + ): + # Call original forward + output = original_model_forward( + self, + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + pixel_values=pixel_values, + pixel_values_videos=pixel_values_videos, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + cache_position=cache_position, + **kwargs, + ) + + # Capture layer_states from output + if hasattr(output, "layer_states") and output.layer_states is not None and is_enabled(): + save_dict(output.layer_states, "layer_states", subdir="layers") + + return output + + Qwen3VLModel.forward = patched_model_forward + + # Patch Qwen3VLVisionModel.forward to capture visual embeddings + original_vision_forward = Qwen3VLVisionModel.forward + + def patched_vision_forward(self, hidden_states, grid_thw, **kwargs): + if is_enabled(): + save_tensor(hidden_states, "visual_hidden_states", subdir="embeddings") + + hidden_states = self.patch_embed(hidden_states) + + pos_embeds = self.fast_pos_embed_interpolate(grid_thw) + hidden_states = hidden_states + pos_embeds + + rotary_pos_emb = self.rot_pos_emb(grid_thw) + + seq_len, _ = hidden_states.size() + hidden_states = hidden_states.reshape(seq_len, -1) + rotary_pos_emb = rotary_pos_emb.reshape(seq_len, -1) + emb = torch.cat((rotary_pos_emb, rotary_pos_emb), dim=-1) + position_embeddings = (emb.cos(), emb.sin()) + + cu_seqlens = torch.repeat_interleave(grid_thw[:, 1] * grid_thw[:, 2], grid_thw[:, 0]).cumsum( + dim=0, + # Select dtype based on the following factors: + # - FA2 requires that cu_seqlens_q must have dtype int32 + # - torch.onnx.export requires that cu_seqlens_q must have same dtype as grid_thw + # See https://github.com/huggingface/transformers/pull/34852 for more information + dtype=grid_thw.dtype if torch.jit.is_tracing() else torch.int32, + ) + cu_seqlens = F.pad(cu_seqlens, (1, 0), value=0) + + deepstack_feature_lists = [] + for layer_num, blk in enumerate(self.blocks): + hidden_states = blk( + hidden_states, + cu_seqlens=cu_seqlens, + position_embeddings=position_embeddings, + layer_ids=layer_num, + **kwargs, + ) + save_tensor(hidden_states, f"visual_hidden_states_{layer_num}", subdir="embeddings") + if layer_num in self.deepstack_visual_indexes: + deepstack_feature = self.deepstack_merger_list[self.deepstack_visual_indexes.index(layer_num)]( + hidden_states + ) + save_tensor(deepstack_feature, f"visual_deepstack_feature_{layer_num}", subdir="embeddings") + deepstack_feature_lists.append(deepstack_feature) + + hidden_states = self.merger(hidden_states) + save_tensor(hidden_states, "final_visual_image_embeds", subdir="embeddings") + print(f"[DEBUG] Visual Atten Type: {self.blocks[0].attn.config._attn_implementation}") + + output = hidden_states, deepstack_feature_lists + else: + return original_vision_forward(self, hidden_states, grid_thw, **kwargs) + + # Visual model returns (image_embeds, deepstack_image_embeds) + if is_enabled(): + if isinstance(output, tuple) and len(output) >= 1: + image_embeds = output[0] + save_tensor(image_embeds, 
"visual_image_embeds", subdir="embeddings") + + if len(output) >= 2 and output[1] is not None: + deepstack_embeds = output[1] + for i, embed in enumerate(deepstack_embeds): + save_tensor(embed, f"visual_deepstack_embeds_{i}", subdir="embeddings") + + return output + + Qwen3VLVisionModel.forward = patched_vision_forward + + original_vision_decoder_block_forward = Qwen3VLVisionBlock.forward + + def patched_vision_decoder_block_forward( + self, hidden_states, cu_seqlens, rotary_pos_emb=None, position_embeddings=None, **kwargs + ): + if is_enabled(): + layer_ids = kwargs.pop("layer_ids", 0) + norm_result = self.norm1(hidden_states) + save_tensor(norm_result, f"visual_block_{layer_ids}_after_norm1", subdir="embeddings") + + attn_result = self.attn( + norm_result, + cu_seqlens=cu_seqlens, + rotary_pos_emb=rotary_pos_emb, + position_embeddings=position_embeddings, + layer_ids=layer_ids, + **kwargs, + ) + save_tensor(attn_result, f"visual_block_{layer_ids}_after_attn", subdir="embeddings") + + hidden_states = hidden_states + attn_result + + norm_result = self.norm2(hidden_states) + save_tensor(norm_result, f"visual_block_{layer_ids}_after_norm2", subdir="embeddings") + + mlp_result = self.mlp(norm_result) + save_tensor(mlp_result, f"visual_block_{layer_ids}_after_mlp", subdir="embeddings") + + hidden_states = hidden_states + mlp_result + return hidden_states + return original_vision_decoder_block_forward( + self, hidden_states, cu_seqlens, position_embeddings, **kwargs + ) + + Qwen3VLVisionBlock.forward = patched_vision_decoder_block_forward + + original_vision_attention_forward = Qwen3VLVisionAttention.forward + + def patched_vision_attention_forward( + self, + hidden_states, + cu_seqlens, + rotary_pos_emb=None, + position_embeddings=None, + **kwargs, + ): + if is_enabled(): + layer_ids = kwargs.pop("layer_ids", 0) + seq_length = hidden_states.shape[0] + query_states, key_states, value_states = ( + self.qkv(hidden_states).reshape(seq_length, 3, self.num_heads, -1).permute(1, 0, 2, 3).unbind(0) + ) + cos, sin = position_embeddings + query_states, key_states = apply_rotary_pos_emb_vision(query_states, key_states, cos, sin) + + query_states = query_states.transpose(0, 1).unsqueeze(0) + key_states = key_states.transpose(0, 1).unsqueeze(0) + value_states = value_states.transpose(0, 1).unsqueeze(0) + + if layer_ids == 0: + save_tensor(query_states, f"visual_block_{layer_ids}_query_states", subdir="embeddings") + save_tensor(key_states, f"visual_block_{layer_ids}_key_states", subdir="embeddings") + save_tensor(value_states, f"visual_block_{layer_ids}_value_states", subdir="embeddings") + save_tensor(cos, f"visual_block_{layer_ids}_cos", subdir="embeddings") + save_tensor(sin, f"visual_block_{layer_ids}_sin", subdir="embeddings") + + attention_interface: Callable = eager_attention_forward + if self.config._attn_implementation != "eager": + attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation] + + if self.config._attn_implementation == "flash_attention_2": + # Flash Attention 2: Use cu_seqlens for variable length attention + max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max() + attn_output, _ = attention_interface( + self, + query_states, + key_states, + value_states, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + cu_seq_lens_q=cu_seqlens, + cu_seq_lens_k=cu_seqlens, + max_length_q=max_seqlen, + max_length_k=max_seqlen, + is_causal=False, + **kwargs, + ) + + if layer_ids == 0: + save_tensor(attn_output, 
f"visual_block_{layer_ids}_after_attn_output", subdir="embeddings") + else: + # Other implementations: Process each chunk separately + lengths = cu_seqlens[1:] - cu_seqlens[:-1] + splits = [ + torch.split(tensor, lengths.tolist(), dim=2) + for tensor in (query_states, key_states, value_states) + ] + + attn_outputs = [ + attention_interface( + self, + q, + k, + v, + attention_mask=None, + scaling=self.scaling, + dropout=0.0 if not self.training else self.attention_dropout, + is_causal=False, + **kwargs, + )[0] + for q, k, v in zip(*splits) + ] + attn_output = torch.cat(attn_outputs, dim=1) + + attn_output = attn_output.reshape(seq_length, -1).contiguous() + attn_output = self.proj(attn_output) + + if layer_ids == 0: + save_tensor(attn_output, f"visual_block_{layer_ids}_after_o_output", subdir="embeddings") + return attn_output + else: + return original_vision_attention_forward( + self, hidden_states, cu_seqlens, rotary_pos_emb, position_embeddings, **kwargs + ) + + Qwen3VLVisionAttention.forward = patched_vision_attention_forward + + original_text_mlp_forward = Qwen3VLTextMLP.forward + + def patched_text_mlp_forward(self, x, layer_ids=0): + if is_enabled(): + up_proj = self.up_proj(x) + save_tensor(up_proj, f"text_block_{layer_ids}_up_proj", subdir="layers") + gate_proj = self.gate_proj(x) + save_tensor(gate_proj, f"text_block_{layer_ids}_gate_proj", subdir="layers") + act_fn = self.act_fn(gate_proj) + save_tensor(act_fn, f"text_block_{layer_ids}_act_fn", subdir="layers") + down_proj = self.down_proj(act_fn * up_proj) + save_tensor(down_proj, f"text_block_{layer_ids}_down_proj", subdir="layers") + + if layer_ids == 0: + up_proj_weight = self.up_proj.weight + save_tensor(up_proj_weight, f"text_block_{layer_ids}_up_proj_weight", subdir="layers") + gate_proj_weight = self.gate_proj.weight + save_tensor(gate_proj_weight, f"text_block_{layer_ids}_gate_proj_weight", subdir="layers") + down_proj_weight = self.down_proj.weight + save_tensor(down_proj_weight, f"text_block_{layer_ids}_down_proj_weight", subdir="layers") + return down_proj + return original_text_mlp_forward(self, x) + + Qwen3VLTextMLP.forward = patched_text_mlp_forward + + original_text_decoder_layer_forward = Qwen3VLTextDecoderLayer.forward + + def patched_text_decoder_layer_forward( + self, + hidden_states: torch.Tensor, + position_embeddings: tuple[torch.Tensor, torch.Tensor], + attention_mask=None, + position_ids=None, + past_key_values=None, + use_cache=False, + cache_position=None, + **kwargs, + ): + if is_enabled(): + layer_ids = kwargs.pop("layer_ids", 0) + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + + before_attn = hidden_states + # Self Attention + hidden_states, _ = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + use_cache=use_cache, + cache_position=cache_position, + position_embeddings=position_embeddings, + **kwargs, + ) + + after_attn = hidden_states + + hidden_states = residual + hidden_states + + # Fully Connected + residual = hidden_states + hidden_states = self.post_attention_layernorm(hidden_states) + + after_post_norm = hidden_states + + hidden_states = self.mlp(hidden_states, layer_ids=layer_ids) + after_mlp = hidden_states + + hidden_states = residual + hidden_states + + after_mlp_res = hidden_states + + layer_states = { + "before_attn": before_attn, + "after_attn": after_attn, + "after_post_norm": after_post_norm, + "after_mlp": after_mlp, + "after_mlp_res": after_mlp_res, + } + 
return hidden_states, layer_states + else: + return original_text_decoder_layer_forward( + self, + hidden_states, + position_embeddings, + attention_mask, + position_ids, + past_key_values, + use_cache, + cache_position, + **kwargs, + ) + + Qwen3VLTextDecoderLayer.forward = patched_text_decoder_layer_forward + + return True + except ImportError as e: + print(f"Warning: Could not import Qwen3VL models for patching: {e}") + return False + + +# ----------------------------- +# Megatron/mcore patches +# ----------------------------- +def apply_qwen3vl_megatron_patches(): + """ + Apply patches to mcore_adapter Qwen3-VL classes to capture per-layer states + (similar naming/layout to the HF patch above) for divergence debugging. + """ + try: + from mcore_adapter.models.qwen3_vl.modeling_qwen3_vl import Qwen3VLGPTModel # type: ignore[import-not-found] + from mcore_adapter.models.qwen3_vl.transformer_block import ( + Qwen3VLTransformerBlock, + ) # type: ignore[import-not-found] + + # ------------------------- + # Patch Qwen3VLGPTModel.forward + # Capture embeddings + visual injection inputs at the text stack boundary. + # ------------------------- + original_gpt_forward = Qwen3VLGPTModel.forward + + def patched_gpt_forward( + self, + input_ids, + position_ids, + attention_mask, + decoder_input=None, + labels=None, + inference_context=None, + packed_seq_params=None, + extra_block_kwargs=None, + runtime_gather_output=None, + *, + inference_params=None, + loss_mask=None, + visual_pos_masks=None, + deepstack_visual_embeds=None, + ): + if is_enabled(): + if decoder_input is not None: + save_tensor(decoder_input, "inputs_embeds", subdir="embeddings") + if visual_pos_masks is not None: + save_tensor(visual_pos_masks, "visual_pos_masks", subdir="embeddings") + if deepstack_visual_embeds is not None: + for i, visual_embed in enumerate(deepstack_visual_embeds): + save_tensor(visual_embed, f"deepstack_visual_embeds_{i}", subdir="embeddings") + + return original_gpt_forward( + self, + input_ids=input_ids, + position_ids=position_ids, + attention_mask=attention_mask, + decoder_input=decoder_input, + labels=labels, + inference_context=inference_context, + packed_seq_params=packed_seq_params, + extra_block_kwargs=extra_block_kwargs, + runtime_gather_output=runtime_gather_output, + inference_params=inference_params, + loss_mask=loss_mask, + visual_pos_masks=visual_pos_masks, + deepstack_visual_embeds=deepstack_visual_embeds, + ) + + Qwen3VLGPTModel.forward = patched_gpt_forward + + # ------------------------- + # Patch Qwen3VLTransformerBlock to capture per-layer intermediates. + # Uses hooks to avoid changing model math. + # Also patches _deepstack_process to attribute "deepstack" state to the last executed layer. 
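+ # Captured per-layer keys mirror the HF patch above: before_attn (input_layernorm output), after_attn (self_attention output), after_post_norm, after_mlp, and after_mlp_res, plus a "deepstack" entry saved from _deepstack_process when visual embeds are injected.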
+ # ------------------------- + original_block_forward = Qwen3VLTransformerBlock.forward + original_deepstack_process = Qwen3VLTransformerBlock._deepstack_process + + def _first_tensor(x): + if x is None: + return None + if isinstance(x, torch.Tensor): + return x + if isinstance(x, (list, tuple)): + for item in x: + t = _first_tensor(item) + if t is not None: + return t + return None + if hasattr(x, "unwrap"): # WrappedTensor + try: + return x.unwrap() + except Exception: + return None + return None + + def patched_deepstack_process(self, hidden_states, visual_pos_masks, visual_embeds): + out = original_deepstack_process(self, hidden_states, visual_pos_masks, visual_embeds) + if is_enabled(): + idx = getattr(self, "_capture_last_layer_idx", None) + if idx is not None: + save_tensor(out, f"layer_states_layer_{idx}_deepstack", subdir="layers") + return out + + Qwen3VLTransformerBlock._deepstack_process = patched_deepstack_process + + def patched_block_forward(self, *args, **kwargs): + if not is_enabled(): + return original_block_forward(self, *args, **kwargs) + + # Last layer idx (global layer number - 1) whose forward just ran on this PP rank. + self._capture_last_layer_idx = None + handles = [] + + def _register(module, fn): + try: + h = module.register_forward_hook(fn) + handles.append(h) + except Exception: + pass + + try: + for layer in getattr(self, "layers", []): + layer_idx = getattr(layer, "layer_number", None) + if layer_idx is not None: + layer_idx = int(layer_idx) - 1 + + # input_layernorm -> before_attn + ln = getattr(layer, "input_layernorm", None) + if ln is not None: + _register( + ln, + ( + lambda idx: ( + lambda _m, _inp, out: ( + save_tensor( + _first_tensor(out), + f"layer_states_layer_{idx}_before_attn", + subdir="layers", + ) + if idx is not None + else None + ) + ) + )(layer_idx), + ) + + # self_attention -> after_attn (attention output before residual) + attn = getattr(layer, "self_attention", None) + if attn is not None: + _register( + attn, + ( + lambda idx: ( + lambda _m, _inp, out: ( + save_tensor( + _first_tensor(out), f"layer_states_layer_{idx}_after_attn", subdir="layers" + ) + if idx is not None + else None + ) + ) + )(layer_idx), + ) + + # post-attn norm (naming differs across versions) + post_ln = getattr(layer, "pre_mlp_layernorm", None) or getattr( + layer, "post_attention_layernorm", None + ) + if post_ln is not None: + _register( + post_ln, + ( + lambda idx: ( + lambda _m, _inp, out: ( + save_tensor( + _first_tensor(out), + f"layer_states_layer_{idx}_after_post_norm", + subdir="layers", + ) + if idx is not None + else None + ) + ) + )(layer_idx), + ) + + # mlp -> after_mlp + mlp = getattr(layer, "mlp", None) + if mlp is not None: + _register( + mlp, + ( + lambda idx: ( + lambda _m, _inp, out: ( + save_tensor( + _first_tensor(out), f"layer_states_layer_{idx}_after_mlp", subdir="layers" + ) + if idx is not None + else None + ) + ) + )(layer_idx), + ) + + # layer output -> after_mlp_res (final hidden after residuals) + def _layer_out_hook(idx): + def _hook(_m, _inp, out): + t = _first_tensor(out) + if idx is not None: + self._capture_last_layer_idx = idx + if t is not None: + save_tensor(t, f"layer_states_layer_{idx}_after_mlp_res", subdir="layers") + + return _hook + + _register(layer, _layer_out_hook(layer_idx)) + + return original_block_forward(self, *args, **kwargs) + finally: + for h in handles: + try: + h.remove() + except Exception: + pass + + Qwen3VLTransformerBlock.forward = patched_block_forward + + return True + except Exception as e: + 
print(f"Warning: Could not import mcore Qwen3VL models for patching: {e}") + return False + + +# Auto-apply patches when module is imported if enabled +if os.getenv("AUTO_APPLY_MODEL_PATCHES", "0") == "1": + apply_qwen3vl_patches() + apply_qwen3vl_megatron_patches() diff --git a/tests/distributed/strategy/log_probs/layer_states_capture.py b/tests/distributed/strategy/log_probs/layer_states_capture.py new file mode 100644 index 000000000..689e9f351 --- /dev/null +++ b/tests/distributed/strategy/log_probs/layer_states_capture.py @@ -0,0 +1,165 @@ +import os +from pathlib import Path +from typing import Any, Dict, Optional + +import torch +import torch.distributed as dist + +_capture_info = None + + +class LayerStatesCapture: + + _instance = None + _initialized = False + + def __new__(cls): + if cls._instance is None: + cls._instance = super().__new__(cls) + return cls._instance + + def __init__(self): + if not self._initialized: + self._update_from_env_or_meta() + self._initialized = True + + def _update_from_env_or_meta(self): + """Update capture settings from environment variables or global _capture_info.""" + global _capture_info + + # First check global _capture_info (set from meta_info) + if _capture_info is not None: + self.save_dir = _capture_info.get("save_dir") + self.prefix = _capture_info.get("prefix", "capture") + self.global_step = _capture_info.get("step", 0) + self.batch_idx = _capture_info.get("batch_idx", 0) + else: + # Fall back to environment variables + self.save_dir = os.getenv("LAYER_STATES_SAVE_DIR", None) + self.prefix = os.getenv("LAYER_STATES_PREFIX", "capture") + self.global_step = int(os.getenv("LAYER_STATES_STEP", "0")) + self.batch_idx = int(os.getenv("LAYER_STATES_BATCH", "0")) + + self.enabled = self.save_dir is not None + + def update_from_meta_info(self, meta_info: Dict): + """Update capture settings from DataProto meta_info.""" + global _capture_info + if "_capture_layer_states" in meta_info: + _capture_info = meta_info["_capture_layer_states"] + self._update_from_env_or_meta() + else: + _capture_info = None + self._update_from_env_or_meta() + + def save_tensor(self, tensor: torch.Tensor, name: str, subdir: str = ""): + """Save a tensor to disk if capture is enabled.""" + # Refresh settings before each save + self._update_from_env_or_meta() + + if not self.enabled: + return + + # Optional: gather CP (Ulysses) sharded sequence tensors before saving. + # This is meant for debugging context-parallel divergence: + # - We only gather common "sequence-shaped" tensors (ndim == 3), e.g. (bs, seq, hidden) + # - We concatenate on dim=1 by default (the seq dimension) + # - We save only on rank0 to avoid duplicate files + # + # Enable with: + # - LAYER_STATES_CP_GATHER=1 + # Optional knobs: + # - LAYER_STATES_CP_GATHER_DIM (default: 1) + # - LAYER_STATES_CP_GATHER_SAVE_LOCAL=1 (also save local shard under original name) + do_cp_gather = os.getenv("LAYER_STATES_CP_GATHER", "0") == "1" + gather_dim = int(os.getenv("LAYER_STATES_CP_GATHER_DIM", "1")) + save_local = os.getenv("LAYER_STATES_CP_GATHER_SAVE_LOCAL", "0") == "1" + + gathered_tensor: torch.Tensor | None = None + if ( + do_cp_gather + and isinstance(tensor, torch.Tensor) + and tensor.ndim == 3 + and dist.is_available() + and dist.is_initialized() + ): + try: + # Prefer the dedicated CP/Ulysses group if available; otherwise fall back to WORLD. 
+ try: + from roll.utils.context_parallel.globals import ( + get_ulysses_group, + ) # local import for test-only util + + group = get_ulysses_group() + except Exception: + group = dist.group.WORLD + + world = dist.get_world_size(group=group) + if world > 1 and gather_dim < tensor.ndim: + # Assumes equal shapes across ranks for the gathered dim (true for padded CP and non-rmpad tests). + parts = [torch.empty_like(tensor) for _ in range(world)] + dist.all_gather(parts, tensor, group=group) + gathered_tensor = torch.cat(parts, dim=gather_dim) + + if dist.get_rank(group=group) != 0: + # Non-zero ranks participate in all_gather but do not write files. + return + except Exception: + # Never fail training/tests due to debug capture logic. + gathered_tensor = None + + save_path = Path(self.save_dir) + if subdir: + save_path = save_path / subdir + save_path.mkdir(parents=True, exist_ok=True) + + if gathered_tensor is not None: + if save_local: + local_path = save_path / f"{self.prefix}_step{self.global_step}_batch{self.batch_idx}_{name}.pt" + torch.save(tensor.cpu().detach(), local_path) + + file_path = save_path / f"{self.prefix}_step{self.global_step}_batch{self.batch_idx}_{name}_gathered.pt" + torch.save(gathered_tensor.cpu().detach(), file_path) + else: + file_path = save_path / f"{self.prefix}_step{self.global_step}_batch{self.batch_idx}_{name}.pt" + torch.save(tensor.cpu().detach(), file_path) + + def save_dict(self, data: Dict[str, Any], name: str, subdir: str = ""): + """Save a dictionary of tensors.""" + # Refresh settings before each save + self._update_from_env_or_meta() + + if not self.enabled: + return + + for key, value in data.items(): + if isinstance(value, torch.Tensor): + self.save_tensor(value, f"{name}_{key}", subdir) + elif isinstance(value, dict): + self.save_dict(value, f"{name}_{key}", subdir) + elif isinstance(value, (list, tuple)) and len(value) > 0: + if isinstance(value[0], torch.Tensor): + for i, tensor in enumerate(value): + self.save_tensor(tensor, f"{name}_{key}_{i}", subdir) + + +# Global instance +_capture = LayerStatesCapture() + + +def save_tensor(tensor: torch.Tensor, name: str, subdir: str = ""): + """Convenience function to save a tensor.""" + _capture._update_from_env_or_meta() # Refresh settings + _capture.save_tensor(tensor, name, subdir) + + +def save_dict(data: Dict[str, Any], name: str, subdir: str = ""): + """Convenience function to save a dict.""" + _capture._update_from_env_or_meta() # Refresh settings + _capture.save_dict(data, name, subdir) + + +def is_enabled() -> bool: + """Check if capture is enabled.""" + _capture._update_from_env_or_meta() # Refresh settings + return _capture.enabled diff --git a/tests/distributed/strategy/log_probs/log_probs_cmp_config.yaml b/tests/distributed/strategy/log_probs/log_probs_cmp_config.yaml index b3079dfd3..befeed536 100644 --- a/tests/distributed/strategy/log_probs/log_probs_cmp_config.yaml +++ b/tests/distributed/strategy/log_probs/log_probs_cmp_config.yaml @@ -21,7 +21,7 @@ prompt_length: 128 response_length: 64 -pretrain: Qwen/Qwen2.5-7B-Instruct +pretrain: Qwen/Qwen3-0.6B actor_train: diff --git a/tests/distributed/strategy/log_probs/log_probs_fsdp_config.yaml b/tests/distributed/strategy/log_probs/log_probs_fsdp_config.yaml new file mode 100644 index 000000000..c827ece26 --- /dev/null +++ b/tests/distributed/strategy/log_probs/log_probs_fsdp_config.yaml @@ -0,0 +1,109 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "fsdp_log_probs_test" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +rollout_batch_size: 128 +prompt_length: 128 +response_length: 512 + +pretrain: Qwen/Qwen3-0.6B + +actor_train: + name: actor_train + worker_cls: roll.pipeline.base_worker.ActorWorker + training_args: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 8 + learning_rate: 1.0e-05 + num_train_epochs: 1 + max_steps: 10 + warmup_steps: 0 + logging_steps: 1 + save_steps: 10 + eval_steps: 10 + seed: 42 + max_grad_norm: 1.0 + weight_decay: 0.0 + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + ulysses_size: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: True + offload_policy: true + fsdp_size: 8 + checkpoint_config: + async_upload: False + offload_nccl: False + use_remove_padding: False + use_dynamic_batching_in_train: False + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + name: actor_infer + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 1.0 + top_k: 100 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.75 + block_size: 16 + load_format: auto + max_num_seqs: 1024 + max_num_batched_tokens: 8096 + enable_prefix_caching: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reference: + name: reference + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 2 + diff --git a/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_config.yaml b/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_config.yaml new file mode 100644 index 000000000..8adc17070 --- /dev/null +++ b/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_config.yaml @@ -0,0 +1,110 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "fsdp_cp_log_probs_test" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +rollout_batch_size: 128 +prompt_length: 8192 +response_length: 8192 + +pretrain: Qwen/Qwen3-0.6B + +actor_train: + name: actor_train + worker_cls: roll.pipeline.base_worker.ActorWorker + training_args: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 16 + learning_rate: 1.0e-05 + num_train_epochs: 1 + max_steps: 10 + warmup_steps: 0 + logging_steps: 1 + save_steps: 10 + eval_steps: 10 + seed: 42 + max_grad_norm: 1.0 + weight_decay: 0.0 + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + ulysses_size: 2 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: True + offload_policy: False + fsdp_size: 4 + checkpoint_config: + async_upload: False + offload_nccl: False + use_remove_padding: False + use_dynamic_batching_in_train: False + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + name: actor_infer + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 1.0 + top_k: 100 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.75 + block_size: 16 + load_format: auto + max_num_seqs: 1024 + max_num_batched_tokens: 8096 + enable_prefix_caching: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + + +reference: + name: reference + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 2 + diff --git a/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_rmpad_config.yaml b/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_rmpad_config.yaml new file mode 100644 index 000000000..9b9bc3899 --- /dev/null +++ b/tests/distributed/strategy/log_probs/log_probs_fsdp_cp_rmpad_config.yaml @@ -0,0 +1,111 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "fsdp_cp_log_probs_test" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +rollout_batch_size: 128 +prompt_length: 8192 +response_length: 8192 + +pretrain: Qwen/Qwen3-0.6B + +actor_train: + name: actor_train + worker_cls: roll.pipeline.base_worker.ActorWorker + training_args: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 16 + learning_rate: 1.0e-05 + num_train_epochs: 1 + max_steps: 10 + warmup_steps: 0 + logging_steps: 1 + save_steps: 10 + eval_steps: 10 + seed: 42 + max_grad_norm: 1.0 + weight_decay: 0.0 + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + ulysses_size: 2 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: True + offload_policy: False + fsdp_size: 4 + checkpoint_config: + async_upload: False + offload_nccl: False + use_remove_padding: true + use_dynamic_batching_in_train: False + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + name: actor_infer + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 1.0 + top_k: 100 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.75 + block_size: 16 + load_format: auto + max_num_seqs: 1024 + max_num_batched_tokens: 8096 + enable_prefix_caching: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + + +reference: + name: reference + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 2 + use_remove_padding: false + diff --git a/tests/distributed/strategy/log_probs/log_probs_fsdp_lora_config.yaml b/tests/distributed/strategy/log_probs/log_probs_fsdp_lora_config.yaml new file mode 100644 index 000000000..6721a716d --- /dev/null +++ b/tests/distributed/strategy/log_probs/log_probs_fsdp_lora_config.yaml @@ -0,0 +1,113 @@ +hydra: + run: + dir: . 
+ output_subdir: null + +exp_name: "fsdp_lora_log_probs_test" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +rollout_batch_size: 128 +prompt_length: 128 +response_length: 512 + +pretrain: Qwen/Qwen3-0.6B + +actor_train: + name: actor_train + worker_cls: roll.pipeline.base_worker.ActorWorker + training_args: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 8 + learning_rate: 1.0e-05 + num_train_epochs: 1 + max_steps: 10 + warmup_steps: 0 + logging_steps: 1 + save_steps: 10 + eval_steps: 10 + seed: 42 + max_grad_norm: 1.0 + weight_decay: 0.0 + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + ulysses_size: 1 + lora_target: q_proj, k_proj, v_proj, o_proj + lora_r: 16 + lora_alpha: 32 + lora_dropout: 0.05 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: True + offload_policy: False + fsdp_size: 8 + checkpoint_config: + async_upload: False + offload_nccl: False + use_remove_padding: False + use_dynamic_batching_in_train: False + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +actor_infer: + name: actor_infer + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 1.0 + top_k: 100 + num_beams: 1 + temperature: 1.0 + num_return_sequences: 1 + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + preprocessing_num_workers: 16 + max_samples: ${rollout_batch_size} + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.75 + block_size: 16 + load_format: auto + max_num_seqs: 1024 + max_num_batched_tokens: 8096 + enable_prefix_caching: true + device_mapping: list(range(0,8)) + infer_batch_size: 2 + +reference: + name: reference + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 2 + diff --git a/tests/distributed/strategy/log_probs/log_probs_fsdp_vlm_cp2_config.yaml b/tests/distributed/strategy/log_probs/log_probs_fsdp_vlm_cp2_config.yaml new file mode 100644 index 000000000..de983f158 --- /dev/null +++ b/tests/distributed/strategy/log_probs/log_probs_fsdp_vlm_cp2_config.yaml @@ -0,0 +1,103 @@ +hydra: + run: + dir: . + output_subdir: null + +exp_name: "fsdp_cp2_vlm_log_probs_test" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +# Keep batch sizes small by default (VLM is heavy). Override as needed. +rollout_batch_size: 2 + +# Prompt/response lengths are taken from the example (prompt_length=8192). Response length can be smaller for tests. 
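+# As an example, a quicker smoke test could lower these (say 2048 / 1024); the 8192/8192 defaults below follow the reference example.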
+prompt_length: 8192 +response_length: 8192 + +pretrain: /home/dilixiati.dlxtmhte/.cache/openlm/hub/b961282fc5087c3ee28b5c7d2a72424e + +actor_train: + name: actor_train + worker_cls: roll.pipeline.base_worker.ActorWorker + training_args: + per_device_train_batch_size: 2 + gradient_accumulation_steps: 1 + learning_rate: 1.0e-6 + num_train_epochs: 1 + max_steps: 1 + warmup_steps: 0 + logging_steps: 1 + save_steps: 10 + eval_steps: 10 + seed: 42 + max_grad_norm: 1.0 + weight_decay: 1.0e-2 + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + dtype: bf16 + ulysses_size: 2 + # Ensure image preprocessing works (mirrors RLVRVLMPipeline defaults). + max_pixels: 1048576 + min_pixels: 3136 + # keep vision frozen (as in example config) to reduce training footprint + freeze_module_prefix: vision_model + data_args: + file_name: ./data/geoqa_data/ + dataset_dir: ./ + preprocessing_num_workers: 16 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: true + offload_policy: false + fsdp_size: 1 + checkpoint_config: + async_upload: false + offload_nccl: false + use_remove_padding: false + use_dynamic_batching_in_train: false + # Match the example (8 GPUs). The test will skip if the machine has fewer. + device_mapping: list(range(0,2)) + infer_batch_size: 1 + +actor_infer: + name: actor_infer + worker_cls: roll.pipeline.base_worker.InferWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + disable_gradient_checkpointing: true + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + device_mapping: list(range(0,8)) + infer_batch_size: 1 + +reference: + name: reference + worker_cls: roll.pipeline.base_worker.ActorWorker + model_args: + model_name_or_path: ${pretrain} + attn_implementation: fa2 + dtype: bf16 + strategy_args: + strategy_name: hf_infer + strategy_config: ~ + device_mapping: list(range(0,1)) + infer_batch_size: 1 \ No newline at end of file diff --git a/tests/distributed/strategy/log_probs/log_probs_megatron_config.yaml b/tests/distributed/strategy/log_probs/log_probs_megatron_config.yaml index 3b51df270..1b4f67e2f 100644 --- a/tests/distributed/strategy/log_probs/log_probs_megatron_config.yaml +++ b/tests/distributed/strategy/log_probs/log_probs_megatron_config.yaml @@ -14,7 +14,7 @@ rollout_batch_size: 512 prompt_length: 128 response_length: 512 -pretrain: Qwen/Qwen2.5-7B-Instruct +pretrain: Qwen/Qwen3-0.6B actor_infer: model_args: diff --git a/tests/distributed/strategy/log_probs/test_ds_hf_log_probs.py b/tests/distributed/strategy/log_probs/test_ds_hf_log_probs.py index e5810bcae..113fb1286 100644 --- a/tests/distributed/strategy/log_probs/test_ds_hf_log_probs.py +++ b/tests/distributed/strategy/log_probs/test_ds_hf_log_probs.py @@ -7,14 +7,15 @@ from roll.datasets.collator import DataCollatorWithPaddingForPaddedKeys from roll.datasets.loader import get_dataset -from roll.pipeline.base_worker import ActorWorker from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.initialize import init from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker from 
roll.utils.logging import get_logger -from tests.distributed.strategy.make_baseline_config import make_baseline_config +from tests.distributed.strategy.make_baseline_config import \ + make_baseline_config logger = get_logger() @@ -26,7 +27,6 @@ def __init__(self, pipeline_config): set_seed(self.pipeline_config.seed) self.tokenizer = default_tokenizer_provider( model_args=self.pipeline_config.actor_train.model_args, - template_name=self.pipeline_config.actor_train.data_args.template, ) self.dataset = get_dataset( tokenizer=self.tokenizer, @@ -92,22 +92,84 @@ def run(self): logprobs_zero3_eq = self.reference.compute_log_probs(batch) prompt_ids = generate_output.batch["prompts"] + response_ids = generate_output.batch["responses"] prompts = self.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True) - for prompt, logprob_zero3_ne, logprob_hf, logprob_zero3_eq in zip( + responses = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True) + + # Compute per-sample differences + count = 0 + sum_diff_zero3ne_hf_max = 0.0 + sum_diff_zero3ne_hf_mean = 0.0 + sum_diff_zero3eq_hf_max = 0.0 + sum_diff_zero3eq_hf_mean = 0.0 + + for prompt, response, logprob_zero3_ne, logprob_hf, logprob_zero3_eq in zip( prompts, + responses, logprobs_zero3_ne.batch["log_probs"], logprobs_hf.batch["log_probs"], logprobs_zero3_eq.batch["log_probs"], ): + # Compute differences + diff_zero3ne_hf_max = (logprob_zero3_ne - logprob_hf).abs().max().item() + diff_zero3ne_hf_mean = (logprob_zero3_ne - logprob_hf).abs().mean().item() + diff_zero3eq_hf_max = (logprob_zero3_eq - logprob_hf).abs().max().item() + diff_zero3eq_hf_mean = (logprob_zero3_eq - logprob_hf).abs().mean().item() + + sum_diff_zero3ne_hf_max += diff_zero3ne_hf_max + sum_diff_zero3ne_hf_mean += diff_zero3ne_hf_mean + sum_diff_zero3eq_hf_max += diff_zero3eq_hf_max + sum_diff_zero3eq_hf_mean += diff_zero3eq_hf_mean + count += 1 + result = { "prompt": prompt, + "response": response, + "diff_zero3ne_hf_max": diff_zero3ne_hf_max, + "diff_zero3ne_hf_mean": diff_zero3ne_hf_mean, + "diff_zero3eq_hf_max": diff_zero3eq_hf_max, + "diff_zero3eq_hf_mean": diff_zero3eq_hf_mean, "logprob_zero3_ne": logprob_zero3_ne.tolist(), "logprob_hf": logprob_hf.tolist(), "logprob_zero3_eq": logprob_zero3_eq.tolist(), } - print(result) results.append(result) - + + # Log average differences for this batch + logger.info( + f"Batch {global_step} - ZeRO3(ne) vs HF: " + f"avg_diff_max={sum_diff_zero3ne_hf_max / count:.6f}, " + f"avg_diff_mean={sum_diff_zero3ne_hf_mean / count:.6f}" + ) + logger.info( + f"Batch {global_step} - ZeRO3(eq) vs HF: " + f"avg_diff_max={sum_diff_zero3eq_hf_max / count:.6f}, " + f"avg_diff_mean={sum_diff_zero3eq_hf_mean / count:.6f}" + ) + + global_step += 1 + + logger.info("pipeline complete!") + + # Compute and log overall statistics + if results: + overall_zero3ne_hf_max = sum(r["diff_zero3ne_hf_max"] for r in results) / len(results) + overall_zero3ne_hf_mean = sum(r["diff_zero3ne_hf_mean"] for r in results) / len(results) + overall_zero3eq_hf_max = sum(r["diff_zero3eq_hf_max"] for r in results) / len(results) + overall_zero3eq_hf_mean = sum(r["diff_zero3eq_hf_mean"] for r in results) / len(results) + + logger.info("=" * 80) + logger.info("Overall Statistics:") + logger.info( + f" ZeRO3(ne) vs HF: avg_diff_max={overall_zero3ne_hf_max:.6f}, " + f"avg_diff_mean={overall_zero3ne_hf_mean:.6f}" + ) + logger.info( + f" ZeRO3(eq) vs HF: avg_diff_max={overall_zero3eq_hf_max:.6f}, " + f"avg_diff_mean={overall_zero3eq_hf_mean:.6f}" + ) + logger.info("=" * 80) + 
return results @@ -121,4 +183,8 @@ def run(self): output_file = "logprobs_cmp.json" with open(output_file, "w") as f: - json.dump(results, f, ensure_ascii=False) + for m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + + logger.info(f"Results saved to {output_file}") diff --git a/tests/distributed/strategy/log_probs/test_fsdp_log_probs.py b/tests/distributed/strategy/log_probs/test_fsdp_log_probs.py new file mode 100644 index 000000000..3953d32d4 --- /dev/null +++ b/tests/distributed/strategy/log_probs/test_fsdp_log_probs.py @@ -0,0 +1,311 @@ +import json +from typing import Any, Dict + +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +from roll.datasets.collator import DataCollatorWithPaddingForPaddedKeys +from roll.datasets.loader import get_dataset +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.initialize import init +from roll.distributed.scheduler.protocol import DataProto +from roll.models.model_providers import default_tokenizer_provider +from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.logging import get_logger +from tests.distributed.strategy.make_baseline_config import make_baseline_config + +logger = get_logger() + + +class TestFSDPLogProbsPipeline(BasePipeline): + def __init__(self, pipeline_config: RLVRConfig): + super().__init__(pipeline_config) + + self.tokenizer = default_tokenizer_provider( + model_args=self.pipeline_config.actor_train.model_args, + ) + + # Load dataset + self.dataset = get_dataset( + tokenizer=self.tokenizer, + data_args=self.pipeline_config.actor_train.data_args, + ) + + # Create data collator + data_collator = DataCollatorWithPaddingForPaddedKeys( + tokenizer=self.tokenizer, + max_length=self.pipeline_config.prompt_length, + padding="max_length", + ) + + # Create dataloader + self.dataloader = DataLoader( + dataset=self.dataset, + batch_size=self.pipeline_config.rollout_batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + ) + + max_steps = len(self.dataloader) * self.pipeline_config.actor_train.training_args.num_train_epochs + self.pipeline_config.set_max_steps(max_steps=max_steps) + + # Initialize clusters + self.actor_train: Any = Cluster( + name=self.pipeline_config.actor_train.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_train, + ) + self.actor_infer: Any = Cluster( + name=self.pipeline_config.actor_infer.name, + worker_cls=InferWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_infer, + ) + self.reference: Any = Cluster( + name=self.pipeline_config.reference.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reference, + ) + + self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True) + + @torch.no_grad() + def run(self): + """ + Compare log probs between FSDP2 strategy and HF reference implementation. + Similar to test_ds_hf_log_probs.py logic. 
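+ For each generated sample this records the max/mean of |logprob_fsdp - logprob_ref| and, when lora_target is set, the gap between adapter-enabled and adapter-disabled log probs; per-sample dicts are collected into the returned results list.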
+ """ + global_step = 0 + results = [] + + for batch_dict in tqdm(self.dataloader): + logger.info(f"pipeline step {global_step} start...") + + batch_dict: Dict + batch: DataProto = DataProto.from_single_dict(batch_dict) + batch.meta_info = {"global_step": global_step} + + # Generate responses using actor_infer + gen_batch = batch.pop(batch_keys=["input_ids", "attention_mask", "position_ids"]) + gen_batch.meta_info = {"global_step": global_step} + generate_output: DataProto = self.actor_infer.generate(data=gen_batch) + + # Combine generated output with original batch + batch.batch = generate_output.batch + batch = batch.union(generate_output) + + if self.pipeline_config.actor_train.model_args.lora_target is not None: + batch.meta_info["disable_adapter"] = True + logprobs_fsdp_disable_adapter = self.actor_train.compute_log_probs(batch) + batch.meta_info["disable_adapter"] = False + logprobs_fsdp_enable_adapter = self.actor_train.compute_log_probs(batch) + logprobs_fsdp = logprobs_fsdp_enable_adapter + else: + logprobs_fsdp = self.actor_train.compute_log_probs(batch) + logprobs_fsdp_disable_adapter = None + logprobs_fsdp_enable_adapter = None + + # Compute log probs from reference (should also use HF) + logprobs_ref = self.reference.compute_log_probs(batch) + + # Extract prompt and response for logging + prompt_ids = generate_output.batch["prompts"] + response_ids = generate_output.batch["responses"] + prompts = self.tokenizer.batch_decode(prompt_ids, skip_special_tokens=True) + responses = self.tokenizer.batch_decode(response_ids, skip_special_tokens=True) + + # Compare FSDP vs HF and FSDP vs Reference + count = 0 + sum_diff_max = 0.0 + sum_diff_mean = 0.0 + + # Statistics for adapter enable/disable comparison + sum_diff_adapter_enable_disable_max = 0.0 + sum_diff_adapter_enable_disable_mean = 0.0 + count_adapter = 0 + + # Statistics for FSDP vs HF comparison + sum_diff_fsdp_hf_max = 0.0 + sum_diff_fsdp_hf_mean = 0.0 + count_fsdp_hf = 0 + + # Prepare logprobs lists + logprobs_fsdp_list = logprobs_fsdp.batch["log_probs"] + logprobs_ref_list = logprobs_ref.batch["log_probs"] + + # Prepare adapter logprobs if available + logprobs_fsdp_enable_list = None + logprobs_fsdp_disable_list = None + if logprobs_fsdp_enable_adapter is not None and logprobs_fsdp_disable_adapter is not None: + logprobs_fsdp_enable_list = logprobs_fsdp_enable_adapter.batch["log_probs"] + logprobs_fsdp_disable_list = logprobs_fsdp_disable_adapter.batch["log_probs"] + + for idx, (prompt, response, logprob_fsdp, logprob_ref) in enumerate( + zip( + prompts, + responses, + logprobs_fsdp_list, + logprobs_ref_list, + ) + ): + # Compare FSDP (with adapter enabled) vs FSDP (with adapter disabled) + if logprobs_fsdp_enable_list is not None and logprobs_fsdp_disable_list is not None: + logprob_enable = logprobs_fsdp_enable_list[idx] + logprob_disable = logprobs_fsdp_disable_list[idx] + diff_adapter_max = (logprob_enable - logprob_disable).abs().max().item() + diff_adapter_mean = (logprob_enable - logprob_disable).abs().mean().item() + sum_diff_adapter_enable_disable_max += diff_adapter_max + sum_diff_adapter_enable_disable_mean += diff_adapter_mean + count_adapter += 1 + adapter_diff_max = diff_adapter_max + adapter_diff_mean = diff_adapter_mean + else: + adapter_diff_max = None + adapter_diff_mean = None + + # Compare FSDP vs HF (if both have values) + if logprob_fsdp is not None and logprob_ref is not None: + diff_fsdp_hf_max = (logprob_fsdp - logprob_ref).abs().max().item() + diff_fsdp_hf_mean = (logprob_fsdp - 
logprob_ref).abs().mean().item() + sum_diff_fsdp_hf_max += diff_fsdp_hf_max + sum_diff_fsdp_hf_mean += diff_fsdp_hf_mean + count_fsdp_hf += 1 + else: + diff_fsdp_hf_max = None + diff_fsdp_hf_mean = None + + # Original comparison (FSDP vs HF, kept for backward compatibility) + diff_max = diff_fsdp_hf_max if diff_fsdp_hf_max is not None else 0.0 + diff_mean = diff_fsdp_hf_mean if diff_fsdp_hf_mean is not None else 0.0 + sum_diff_max += diff_max + sum_diff_mean += diff_mean + count += 1 + + result = { + "prompt": prompt, + "response": response, + "diff_max": diff_max, + "diff_mean": diff_mean, + "logprob_fsdp": logprob_fsdp.tolist(), + "logprob_ref": logprob_ref.tolist(), + } + + # Add adapter comparison if available + if adapter_diff_max is not None: + result["diff_adapter_enable_disable_max"] = adapter_diff_max + result["diff_adapter_enable_disable_mean"] = adapter_diff_mean + + # Add explicit FSDP vs HF comparison if available + if diff_fsdp_hf_max is not None: + result["diff_fsdp_hf_max"] = diff_fsdp_hf_max + result["diff_fsdp_hf_mean"] = diff_fsdp_hf_mean + + results.append(result) + + # Log statistics + if count > 0: + logger.info(f"avg_diff_max: {sum_diff_max / count}, avg_diff_mean: {sum_diff_mean / count}") + + if count_adapter > 0: + logger.info( + f"avg_diff_adapter_enable_disable_max: {sum_diff_adapter_enable_disable_max / count_adapter}, " + f"avg_diff_adapter_enable_disable_mean: {sum_diff_adapter_enable_disable_mean / count_adapter}" + ) + + if count_fsdp_hf > 0: + logger.info( + f"avg_diff_fsdp_hf_max: {sum_diff_fsdp_hf_max / count_fsdp_hf}, " + f"avg_diff_fsdp_hf_mean: {sum_diff_fsdp_hf_mean / count_fsdp_hf}" + ) + global_step += 1 + + logger.info("pipeline complete!") + return results + + +def test_fsdp_log_probs_full(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_config") + pipeline = TestFSDPLogProbsPipeline(config) + results = pipeline.run() + + output_file = "test_fsdp_log_probs_full.json" + with open(output_file, "w") as f: + for m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + logger.info(f"Test FSDP (full) completed, results saved to {output_file}") + + +def test_fsdp_log_probs_lora(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_lora_config") + pipeline = TestFSDPLogProbsPipeline(config) + results = pipeline.run() + + output_file = "test_fsdp_log_probs_lora.json" + with open(output_file, "w") as f: + for m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + logger.info(f"Test FSDP (LoRA) completed, results saved to {output_file}") + + +def test_fsdp_log_probs_cp(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_cp_config") + + if torch.cuda.device_count() < 8: + logger.warning(f"Skipping CP test, need at least 8 GPUs (have {torch.cuda.device_count()})") + return + + pipeline = TestFSDPLogProbsPipeline(config) + results = pipeline.run() + + output_file = "test_fsdp_log_probs_cp.json" + with open(output_file, "w") as f: + for m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + logger.info(f"Test FSDP (CP) completed, results saved to {output_file}") + + +def test_fsdp_log_probs_cp_rmpad(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_cp_rmpad_config") + pipeline = TestFSDPLogProbsPipeline(config) + results = pipeline.run() + + output_file = "test_fsdp_log_probs_cp_rmpad.json" + with open(output_file, "w") as f: + for 
m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + logger.info(f"Test FSDP (CP+RMpad) completed, results saved to {output_file}") + + +if __name__ == "__main__": + import sys + + if len(sys.argv) > 1: + test_name = sys.argv[1] + if test_name == "full": + test_fsdp_log_probs_full() + elif test_name == "lora": + test_fsdp_log_probs_lora() + elif test_name == "cp": + test_fsdp_log_probs_cp() + elif test_name == "cp_rmpad": + test_fsdp_log_probs_cp_rmpad() + else: + logger.error(f"Unknown test: {test_name}. Use 'full', 'lora', or 'cp'.") + else: + test_fsdp_log_probs_full() diff --git a/tests/distributed/strategy/log_probs/test_fsdp_vlm_layer_states.py b/tests/distributed/strategy/log_probs/test_fsdp_vlm_layer_states.py new file mode 100644 index 000000000..b5929d5d6 --- /dev/null +++ b/tests/distributed/strategy/log_probs/test_fsdp_vlm_layer_states.py @@ -0,0 +1,308 @@ +import json +import os +from pathlib import Path +from typing import Any, Dict, List, Optional, Union + +import ray +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +# Apply model patches before importing anything that uses the model +os.environ["AUTO_APPLY_MODEL_PATCHES"] = "1" +from tests.distributed.strategy.log_probs.apply_model_patch import ( + apply_qwen3vl_megatron_patches, + apply_qwen3vl_patches, +) + +apply_qwen3vl_patches() +apply_qwen3vl_megatron_patches() + +from roll.datasets.collator import DataCollatorWithPaddingForMM +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.initialize import init +from roll.distributed.scheduler.protocol import DataProto +from roll.models.model_providers import default_processor_provider, get_extra_data_provider +from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.logging import get_logger +from tests.distributed.strategy.log_probs.analyze_layer_divergence import analyze_divergence +from tests.distributed.strategy.make_baseline_config import make_baseline_config + +logger = get_logger() + + +def _actorworker_set_capture_env(self, env: Dict[str, str]): + """ + Test-only helper executed inside Ray workers. + - Sets capture env vars used by `layer_states_capture.py` + - Ensures model patches are applied inside the worker process (not just the driver) + """ + for k, v in env.items(): + os.environ[k] = str(v) + # Apply patches inside the worker process so FSDP2/HF forwards get instrumented. + try: + from tests.distributed.strategy.log_probs.apply_model_patch import ( + apply_qwen3vl_megatron_patches, + apply_qwen3vl_patches, + ) + + apply_qwen3vl_patches() + apply_qwen3vl_megatron_patches() + except Exception: + pass + + +# Monkeypatch onto ActorWorker so we can call it on Ray actors from this test. 
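+# `_set_capture_env_on_cluster` below calls this on every worker
+# (`ray.get([w.set_capture_env.remote(env) for w in cluster.workers])`) once per
+# step/batch, so the LAYER_STATES_* variables point the capture hooks in
+# `layer_states_capture.py` at the right dump location for that batch.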
+setattr(ActorWorker, "set_capture_env", _actorworker_set_capture_env) + + +def _set_capture_env_on_cluster(cluster: Cluster, save_dir: Path, prefix: str, step: int, batch_idx: int): + env = { + "LAYER_STATES_SAVE_DIR": str(save_dir), + "LAYER_STATES_PREFIX": str(prefix), + "LAYER_STATES_STEP": str(step), + "LAYER_STATES_BATCH": str(batch_idx), + } + ray.get([w.set_capture_env.remote(env) for w in cluster.workers]) + + +def _as_list(x: Union[str, List[str], None]) -> List[str]: + if x is None: + return [] + if isinstance(x, str): + return [x] + return list(x) + + +def save_inputs_and_embeddings(data: DataProto, save_dir: Path, prefix: str, global_step: int, batch_idx: int = 0): + """Save input tensors for comparison.""" + save_dir.mkdir(parents=True, exist_ok=True) + + # Save input_ids, attention_mask, position_ids + for key in ["input_ids", "attention_mask", "position_ids", "response_mask"]: + if key in data.batch: + save_path = save_dir / f"{prefix}_step{global_step}_batch{batch_idx}_{key}.pt" + torch.save(data.batch[key].cpu().detach(), save_path) + + # Save multi_modal_data if present + if "multi_modal_data" in data.non_tensor_batch: + mm_data = data.non_tensor_batch["multi_modal_data"] + save_path = save_dir / f"{prefix}_step{global_step}_batch{batch_idx}_multi_modal_data.json" + mm_metadata = {} + if isinstance(mm_data, (list, tuple)): + for i, sample_mm in enumerate(mm_data): + if isinstance(sample_mm, dict): + for k, v in sample_mm.items(): + if isinstance(v, torch.Tensor): + key_name = f"sample{i}_{k}" + mm_metadata[key_name] = {"shape": list(v.shape), "dtype": str(v.dtype)} + tensor_path = save_dir / f"{prefix}_step{global_step}_batch{batch_idx}_mm_{key_name}.pt" + torch.save(v.cpu().detach(), tensor_path) + with open(save_path, "w") as f: + json.dump(mm_metadata, f, indent=2) + + +class TestFSDPVLMLayerStatesPipeline(BasePipeline): + def __init__(self, pipeline_config: RLVRConfig, output_dir: str = "./layer_states_output"): + super().__init__(pipeline_config) + self.pipeline_config = pipeline_config + self.output_dir = Path(output_dir) + self.output_dir.mkdir(parents=True, exist_ok=True) + + self.processor = default_processor_provider(self.pipeline_config.actor_train.model_args) + if self.processor is None: + raise RuntimeError("VLM layer states test requires a processor (AutoProcessor).") + # Follow RLVRVLMPipeline: ensure these are not None + img_proc = getattr(self.processor, "image_processor", None) + if img_proc is not None: + model_args = self.pipeline_config.actor_train.model_args + if getattr(img_proc, "max_pixels", None) is None: + img_proc.max_pixels = getattr(model_args, "max_pixels", 1024 * 1024) + if getattr(img_proc, "min_pixels", None) is None: + img_proc.min_pixels = getattr(model_args, "min_pixels", 56 * 56) + self.tokenizer = self.processor.tokenizer + self.tokenizer.padding_side = "left" + + # Dataset + self.dataset = self._build_dataset_or_skip() + + data_collator = DataCollatorWithPaddingForMM( + tokenizer=self.tokenizer, + processor=self.processor, + extra_data_provider=get_extra_data_provider( + self.pipeline_config.actor_train.model_args.model_name_or_path, + processor=self.processor, + ), + max_length=self.pipeline_config.prompt_length, + padding="max_length", + ) + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_size=self.pipeline_config.rollout_batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + ) + + max_steps = len(self.dataloader) * self.pipeline_config.actor_train.training_args.num_train_epochs + 
self.pipeline_config.set_max_steps(max_steps=max_steps) + + self.actor_train: Any = Cluster( + name=self.pipeline_config.actor_train.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_train, + ) + self.actor_infer: Any = Cluster( + name=self.pipeline_config.actor_infer.name, + worker_cls=InferWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_infer, + ) + self.reference: Any = Cluster( + name=self.pipeline_config.reference.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reference, + ) + + self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True) + + def _build_dataset_or_skip(self): + data_args = self.pipeline_config.actor_train.data_args + file_names = _as_list(getattr(data_args, "file_name", None)) + + if file_names and all(os.path.exists(p) for p in file_names): + import datasets + + from roll.pipeline.rlvr.rlvr_math_vlm_pipeline import encode_function, get_dataset + + features = datasets.Features( + { + "image": datasets.Sequence(feature=datasets.Image(decode=True)), + "prompt": datasets.Value("string"), + "ground_truth": datasets.Value("string"), + "image_flag": datasets.Value("bool"), + "tag": datasets.Value("string"), + } + ) + return get_dataset(data_args, encode_function, self.processor, features) + + @torch.no_grad() + def run(self, max_batches: Optional[int] = None): + """ + Run the pipeline and capture layer states using environment variables. + + Args: + max_batches: Maximum number of batches to process (None for all) + """ + global_step = 0 + results = [] + + # Create output directories + fsdp_dir = self.output_dir / "fsdp2" + hf_dir = self.output_dir / "hf" + inputs_dir = self.output_dir / "inputs" + analysis_dir = self.output_dir / "analysis" + analysis_dir.mkdir(parents=True, exist_ok=True) + + for batch_idx, batch_dict in enumerate(tqdm(self.dataloader)): + if max_batches is not None and batch_idx >= max_batches: + break + + logger.info(f"vlm layer states pipeline step {global_step} batch {batch_idx} start...") + + batch: DataProto = DataProto.from_single_dict(batch_dict) + batch.meta_info = {"global_step": global_step, "_broadcast_non_tensor_batch": True} + + # Save inputs and embeddings + save_inputs_and_embeddings(batch, inputs_dir, "input", global_step, batch_idx) + + # Generate responses using actor_infer (vLLM) + gen_batch = batch.pop( + batch_keys=["input_ids", "attention_mask", "position_ids"], + non_tensor_batch_keys=["multi_modal_data"], + ) + gen_batch.meta_info = {"global_step": global_step} + generate_output: DataProto = self.actor_infer.generate(data=gen_batch) + + # Merge generated full sequences back + batch.batch = generate_output.batch + batch = batch.union(generate_output) + + _set_capture_env_on_cluster( + self.actor_train, + save_dir=fsdp_dir, + prefix="fsdp2", + step=global_step, + batch_idx=batch_idx, + ) + logprobs_fsdp = self.actor_train.compute_log_probs(batch) + + _set_capture_env_on_cluster( + self.reference, + save_dir=hf_dir, + prefix="hf", + step=global_step, + batch_idx=batch_idx, + ) + logprobs_ref = self.reference.compute_log_probs(batch) + + # Directly compare saved inputs/embeddings/layer states for this step/batch. 
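+ # analyze_divergence compares the fsdp2/ and hf/ dumps (together with the saved
+ # inputs) for this step/batch against the 1e-5 threshold and writes one
+ # divergence_step{g}_batch{b}.json report under analysis/.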
+ analysis_out = analysis_dir / f"divergence_step{global_step}_batch{batch_idx}.json" + analyze_divergence( + fsdp_dir=fsdp_dir, + hf_dir=hf_dir, + inputs_dir=inputs_dir, + output_file=analysis_out, + global_step=global_step, + batch_idx=batch_idx, + threshold=1e-5, + ) + + lp_fsdp = logprobs_fsdp.batch["log_probs"] + lp_ref = logprobs_ref.batch["log_probs"] + mask = batch.batch["response_mask"][:, 1:].to(torch.bool) + + diff = (lp_fsdp - lp_ref).abs() + diff_max = diff[mask].max().item() if mask.any() else 0.0 + diff_mean = diff[mask].mean().item() if mask.any() else 0.0 + + results.append( + { + "global_step": global_step, + "batch_idx": batch_idx, + "diff_max": diff_max, + "diff_mean": diff_mean, + } + ) + logger.info(f"vlm logprob diff_max={diff_max:.6f}, diff_mean={diff_mean:.6f}") + + global_step += 1 + + logger.info("vlm layer states pipeline complete!") + + # Save summary + summary_path = self.output_dir / "summary.json" + with open(summary_path, "w") as f: + json.dump(results, f, indent=2) + + return results + + +def test_fsdp_vlm_layer_states_cp2(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_vlm_cp2_config") + pipeline = TestFSDPVLMLayerStatesPipeline(config, output_dir="./layer_states_output") + results = pipeline.run(max_batches=1) # Start with 1 batch for testing + + logger.info(f"Test FSDP VLM layer states (CP2) completed, results saved to {pipeline.output_dir}") + + +if __name__ == "__main__": + test_fsdp_vlm_layer_states_cp2() diff --git a/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs.py b/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs.py new file mode 100644 index 000000000..e1b7af84e --- /dev/null +++ b/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs.py @@ -0,0 +1,214 @@ +import json +import os +from typing import Any, Dict, List, Optional, Union + +import torch +from torch.utils.data import DataLoader +from tqdm import tqdm + +from roll.datasets.collator import DataCollatorWithPaddingForMM +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.initialize import init +from roll.distributed.scheduler.protocol import DataProto +from roll.models.model_providers import default_processor_provider, get_extra_data_provider +from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.logging import get_logger +from tests.distributed.strategy.make_baseline_config import make_baseline_config + +logger = get_logger() + + +def _as_list(x: Union[str, List[str], None]) -> List[str]: + if x is None: + return [] + if isinstance(x, str): + return [x] + return list(x) + + +class TestFSDPVLMLogProbsPipeline(BasePipeline): + """ + VLM logprob precision test: + - use VLM processor + DataCollatorWithPaddingForMM (same data path as RLVRVLMPipeline) + - generate with vLLM (actor_infer) + - compare compute_log_probs between FSDP2 (actor_train) and HF (reference) + """ + + def __init__(self, pipeline_config: RLVRConfig): + super().__init__(pipeline_config) + self.pipeline_config = pipeline_config + + # ------------------------------------------------------------------ + # Qwen3-VL precision debug dumps (rank-0 only inside each Ray actor process). + # We must pass env vars via Ray runtime_env (worker_config.system_envs), not via driver os.environ. 
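+ # Ray actor processes do not inherit the driver's os.environ, so the dump dirs
+ # are handed to the actor_train / reference workers through
+ # worker_config.system_envs (forwarded via the Ray runtime_env) instead.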
+ dump_root = os.path.abspath( + os.getenv( + "QWEN3_VL_TEST_DUMP_ROOT", + os.path.join(self.pipeline_config.output_dir or ".", "qwen3_vl_dumps"), + ) + ) + os.makedirs(dump_root, exist_ok=True) + self.pipeline_config.actor_train.system_envs["QWEN3_VL_DUMP_DIR"] = os.path.join(dump_root, "actor_train") + self.pipeline_config.reference.system_envs["QWEN3_VL_DUMP_DIR"] = os.path.join(dump_root, "reference") + + self.processor = default_processor_provider(self.pipeline_config.actor_train.model_args) + if self.processor is None: + raise RuntimeError("VLM logprob test requires a processor (AutoProcessor).") + # Follow RLVRVLMPipeline: ensure these are not None, otherwise qwen2_vl smart_resize will crash. + img_proc = getattr(self.processor, "image_processor", None) + if img_proc is not None: + model_args = self.pipeline_config.actor_train.model_args + if getattr(img_proc, "max_pixels", None) is None: + img_proc.max_pixels = getattr(model_args, "max_pixels", 1024 * 1024) + if getattr(img_proc, "min_pixels", None) is None: + img_proc.min_pixels = getattr(model_args, "min_pixels", 56 * 56) + self.tokenizer = self.processor.tokenizer + self.tokenizer.padding_side = "left" + + # Dataset: prefer real VLM dataset if paths exist; otherwise skip (this is a GPU-heavy test anyway). + self.dataset = self._build_dataset_or_skip() + + data_collator = DataCollatorWithPaddingForMM( + tokenizer=self.tokenizer, + processor=self.processor, + extra_data_provider=get_extra_data_provider( + self.pipeline_config.actor_train.model_args.model_name_or_path, + processor=self.processor, + ), + max_length=self.pipeline_config.prompt_length, + padding="max_length", + ) + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_size=self.pipeline_config.rollout_batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + ) + + max_steps = len(self.dataloader) * self.pipeline_config.actor_train.training_args.num_train_epochs + self.pipeline_config.set_max_steps(max_steps=max_steps) + + self.actor_train: Any = Cluster( + name=self.pipeline_config.actor_train.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_train, + ) + # self.actor_infer: Any = Cluster( + # name=self.pipeline_config.actor_infer.name, + # worker_cls=InferWorker, + # resource_manager=self.resource_manager, + # worker_config=self.pipeline_config.actor_infer, + # ) + self.reference: Any = Cluster( + name=self.pipeline_config.reference.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reference, + ) + + self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=True) + # self.actor_infer.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True) + + def _build_dataset_or_skip(self): + # Follow RLVRVLMPipeline data path when real files exist. + data_args = self.pipeline_config.actor_train.data_args + file_names = _as_list(getattr(data_args, "file_name", None)) + + # Common pattern in configs: absolute paths only exist on the training machine. 
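+ # If the configured files are not all present locally, fall through and return
+ # None: per the note above, the real VLM data only lives on the training machine
+ # and this GPU-heavy test is meant to be skipped without it.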
+ if file_names and all(os.path.exists(p) for p in file_names): + import datasets + + from roll.pipeline.rlvr.rlvr_math_vlm_pipeline import encode_function, get_dataset + + features = datasets.Features( + { + # only support single image temporarily since sglang usage + # "image": datasets.Image(decode=True), + "image": datasets.Sequence(feature=datasets.Image(decode=True)), + "prompt": datasets.Value("string"), + "ground_truth": datasets.Value("string"), + # for text and multi-modal mixed data usage, indicating valid image + "image_flag": datasets.Value("bool"), + # for area seperated validation, dummy currently + "tag": datasets.Value("string"), + } + ) + return get_dataset(data_args, encode_function, self.processor, features) + + @torch.no_grad() + def run(self): + global_step = 0 + results = [] + + for batch_dict in tqdm(self.dataloader): + logger.info(f"vlm logprob pipeline step {global_step} start...") + + batch: DataProto = DataProto.from_single_dict(batch_dict) + batch.meta_info = {"global_step": global_step, "_broadcast_non_tensor_batch": True} + batch.batch["response_mask"] = batch.batch["attention_mask"].clone() + + # Generate responses using actor_infer (vLLM). Needs multi_modal_data for VLM prompts. + # gen_batch = batch.pop( + # batch_keys=["input_ids", "attention_mask", "position_ids"], + # non_tensor_batch_keys=["multi_modal_data"], + # ) + # gen_batch.meta_info = {"global_step": global_step} + # generate_output: DataProto = self.actor_infer.generate(data=gen_batch) + + # Merge generated full sequences back with original (keeps multi_modal_inputs for HF/FSDP forward). + # batch.batch = generate_output.batch + # batch = batch.union(generate_output) + + # Compute log probs from FSDP2 and HF reference. + logprobs_fsdp = self.actor_train.compute_log_probs(batch) + logprobs_ref = self.reference.compute_log_probs(batch) + + # layer_states = self.actor_train.compute_layer_state(batch) + # layer_states_ref = self.reference.compute_layer_state(batch) + # breakpoint() + + lp_fsdp = logprobs_fsdp.batch["log_probs"] + lp_ref = logprobs_ref.batch["log_probs"] + + diff = (lp_fsdp - lp_ref).abs() + diff_max = diff.max().item() + diff_mean = diff.mean().item() + + results.append( + { + "global_step": global_step, + "diff_max": diff_max, + "diff_mean": diff_mean, + } + ) + logger.info(f"vlm logprob diff_max={diff_max:.6f}, diff_mean={diff_mean:.6f}") + + global_step += 1 + break + + logger.info("vlm logprob pipeline complete!") + return results + + +def test_fsdp_vlm_log_probs_cp2(): + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_vlm_cp2_config") + pipeline = TestFSDPVLMLogProbsPipeline(config) + results = pipeline.run() + + output_file = "test_fsdp_vlm_log_probs_cp2.json" + with open(output_file, "w", encoding="utf-8") as f: + for m in results: + json.dump(m, f, ensure_ascii=False) + f.write("\n") + logger.info(f"Test FSDP VLM log probs (CP2) completed, results saved to {output_file}") + + +if __name__ == "__main__": + test_fsdp_vlm_log_probs_cp2() diff --git a/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs_perf.py b/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs_perf.py new file mode 100644 index 000000000..3073e4bd3 --- /dev/null +++ b/tests/distributed/strategy/log_probs/test_fsdp_vlm_log_probs_perf.py @@ -0,0 +1,310 @@ +import json +import os +import time +from typing import Any, Dict, List, Optional, Union + +import torch +import torch.distributed as dist +from torch.utils.data import DataLoader +from tqdm 
import tqdm + +from roll.datasets.collator import DataCollatorWithPaddingForMM +from roll.distributed.executor.cluster import Cluster +from roll.distributed.scheduler.initialize import init +from roll.distributed.scheduler.protocol import DataProto +from roll.models.model_providers import default_processor_provider, get_extra_data_provider +from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker +from roll.pipeline.rlvr.rlvr_config import RLVRConfig +from roll.utils.logging import get_logger +from tests.distributed.strategy.make_baseline_config import make_baseline_config + +logger = get_logger() + + +def _as_list(x: Union[str, List[str], None]) -> List[str]: + if x is None: + return [] + if isinstance(x, str): + return [x] + return list(x) + + +def get_timer_stats(): + """Get timer statistics from the context parallel utilities.""" + try: + from roll.utils.context_parallel.globals import get_timer, log_timer_stats, clear_timer_stats + return { + "available": True, + "timers": log_timer_stats(), + } + except Exception as e: + return { + "available": False, + "error": str(e) + } + + +def get_memory_stats(): + """Get GPU memory statistics.""" + if not torch.cuda.is_available(): + return {"available": False} + + return { + "available": True, + "allocated_gb": torch.cuda.memory_allocated() / 1024**3, + "reserved_gb": torch.cuda.memory_reserved() / 1024**3, + "max_allocated_gb": torch.cuda.max_memory_allocated() / 1024**3, + } + + +class TestFSDPVLMLogProbsPipeline(BasePipeline): + """ + VLM logprob precision test with performance statistics: + - use VLM processor + DataCollatorWithPaddingForMM (same data path as RLVRVLMPipeline) + - compare compute_log_probs between FSDP2 (actor_train) and HF (reference) + - measure timing, memory usage, and communication overhead + """ + + def __init__(self, pipeline_config: RLVRConfig): + super().__init__(pipeline_config) + self.pipeline_config = pipeline_config + + # ------------------------------------------------------------------ + # Qwen3-VL precision debug dumps (rank-0 only inside each Ray actor process). 
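+ # Same mechanism as test_fsdp_vlm_log_probs.py: the dump root is forwarded to the
+ # actor_train / reference workers via worker_config.system_envs so each process
+ # writes to its own QWEN3_VL_DUMP_DIR subdirectory.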
+ dump_root = os.path.abspath( + os.getenv( + "QWEN3_VL_TEST_DUMP_ROOT", + os.path.join(self.pipeline_config.output_dir or ".", "qwen3_vl_dumps"), + ) + ) + os.makedirs(dump_root, exist_ok=True) + self.pipeline_config.actor_train.system_envs["QWEN3_VL_DUMP_DIR"] = os.path.join(dump_root, "actor_train") + self.pipeline_config.reference.system_envs["QWEN3_VL_DUMP_DIR"] = os.path.join(dump_root, "reference") + + self.processor = default_processor_provider(self.pipeline_config.actor_train.model_args) + if self.processor is None: + raise RuntimeError("VLM logprob test requires a processor (AutoProcessor).") + # Follow RLVRVLMPipeline: ensure these are not None + img_proc = getattr(self.processor, "image_processor", None) + if img_proc is not None: + model_args = self.pipeline_config.actor_train.model_args + if getattr(img_proc, "max_pixels", None) is None: + img_proc.max_pixels = getattr(model_args, "max_pixels", 1024 * 1024) + if getattr(img_proc, "min_pixels", None) is None: + img_proc.min_pixels = getattr(model_args, "min_pixels", 56 * 56) + self.tokenizer = self.processor.tokenizer + self.tokenizer.padding_side = "left" + + # Dataset + self.dataset = self._build_dataset_or_skip() + + data_collator = DataCollatorWithPaddingForMM( + tokenizer=self.tokenizer, + processor=self.processor, + extra_data_provider=get_extra_data_provider( + self.pipeline_config.actor_train.model_args.model_name_or_path, + processor=self.processor, + ), + max_length=self.pipeline_config.prompt_length, + padding="max_length", + ) + + self.dataloader = DataLoader( + dataset=self.dataset, + batch_size=self.pipeline_config.rollout_batch_size, + shuffle=True, + drop_last=True, + collate_fn=data_collator, + ) + + max_steps = len(self.dataloader) * self.pipeline_config.actor_train.training_args.num_train_epochs + self.pipeline_config.set_max_steps(max_steps=max_steps) + + self.actor_train: Any = Cluster( + name=self.pipeline_config.actor_train.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.actor_train, + ) + self.reference: Any = Cluster( + name=self.pipeline_config.reference.name, + worker_cls=ActorWorker, + resource_manager=self.resource_manager, + worker_config=self.pipeline_config.reference, + ) + + self.actor_train.initialize(pipeline_config=self.pipeline_config, blocking=True) + self.reference.initialize(pipeline_config=self.pipeline_config, blocking=True) + + def _build_dataset_or_skip(self): + data_args = self.pipeline_config.actor_train.data_args + file_names = _as_list(getattr(data_args, "file_name", None)) + + if file_names and all(os.path.exists(p) for p in file_names): + import datasets + + from roll.pipeline.rlvr.rlvr_math_vlm_pipeline import encode_function, get_dataset + + features = datasets.Features( + { + "image": datasets.Sequence(feature=datasets.Image(decode=True)), + "prompt": datasets.Value("string"), + "ground_truth": datasets.Value("string"), + "image_flag": datasets.Value("bool"), + "tag": datasets.Value("string"), + } + ) + return get_dataset(data_args, encode_function, self.processor, features) + + @torch.no_grad() + def run(self): + global_step = 0 + results = [] + + # Clear timer stats before starting + try: + from roll.utils.context_parallel.globals import clear_timer_stats + clear_timer_stats() + except: + pass + + for batch_dict in tqdm(self.dataloader): + logger.info(f"vlm logprob pipeline step {global_step} start...") + + batch: DataProto = DataProto.from_single_dict(batch_dict) + batch.meta_info = {"global_step": 
global_step, "_broadcast_non_tensor_batch": True} + batch.batch["response_mask"] = batch.batch["attention_mask"].clone() + + # Get initial memory stats + mem_before = get_memory_stats() + + # Time FSDP2 compute_log_probs + start_fsdp = time.time() + logprobs_fsdp = self.actor_train.compute_log_probs(batch) + time_fsdp = time.time() - start_fsdp + + # Get memory stats after FSDP2 + mem_after_fsdp = get_memory_stats() + + # Get timer stats after FSDP2 + timer_stats_fsdp = get_timer_stats() + + # Clear timers for reference run + try: + from roll.utils.context_parallel.globals import clear_timer_stats + clear_timer_stats() + except: + pass + + # Time HF reference compute_log_probs + start_ref = time.time() + logprobs_ref = self.reference.compute_log_probs(batch) + time_ref = time.time() - start_ref + + # Get memory stats after reference + mem_after_ref = get_memory_stats() + + # Get timer stats after reference (should be minimal) + timer_stats_ref = get_timer_stats() + + # Compute correctness metrics + lp_fsdp = logprobs_fsdp.batch["log_probs"] + lp_ref = logprobs_ref.batch["log_probs"] + + diff = (lp_fsdp - lp_ref).abs() + diff_max = diff.max().item() + diff_mean = diff.mean().item() + diff_std = diff.std().item() + + # Check if results are numerically equivalent + is_correct = diff_max < 1e-5 + + # Batch statistics + batch_size = batch.batch["input_ids"].size(0) + seq_len = batch.batch["input_ids"].size(1) + num_tokens = (batch.batch["attention_mask"].sum()).item() + + # Speedup calculation + speedup = time_ref / time_fsdp if time_fsdp > 0 else 0 + + result = { + "global_step": global_step, + "correctness": { + "diff_max": diff_max, + "diff_mean": diff_mean, + "diff_std": diff_std, + "is_correct": is_correct, + }, + "performance": { + "time_fsdp_seconds": time_fsdp, + "time_ref_seconds": time_ref, + "speedup": speedup, + "tokens_per_second_fsdp": num_tokens / time_fsdp if time_fsdp > 0 else 0, + "tokens_per_second_ref": num_tokens / time_ref if time_ref > 0 else 0, + }, + "memory": { + "before_gb": mem_before.get("allocated_gb", 0), + "after_fsdp_gb": mem_after_fsdp.get("allocated_gb", 0), + "after_ref_gb": mem_after_ref.get("allocated_gb", 0), + "fsdp_memory_increase_gb": mem_after_fsdp.get("allocated_gb", 0) - mem_before.get("allocated_gb", 0), + }, + "batch_info": { + "batch_size": batch_size, + "seq_len": seq_len, + "num_tokens": num_tokens, + }, + "communication": { + "fsdp_timer_stats": timer_stats_fsdp, + "ref_timer_stats": timer_stats_ref, + } + } + + results.append(result) + + logger.info(f"Step {global_step}:") + logger.info(f" Correctness: diff_max={diff_max:.6f}, diff_mean={diff_mean:.6f}, is_correct={is_correct}") + logger.info(f" Performance: FSDP={time_fsdp:.4f}s, Ref={time_ref:.4f}s, Speedup={speedup:.2f}x") + logger.info(f" Throughput: FSDP={result['performance']['tokens_per_second_fsdp']:.0f} tok/s, Ref={result['performance']['tokens_per_second_ref']:.0f} tok/s") + + if timer_stats_fsdp.get("available"): + logger.info(f" Communication stats: {timer_stats_fsdp.get('timers', {})}") + + global_step += 1 + break # Only run one step for testing + + logger.info("vlm logprob pipeline complete!") + return results + + +def test_fsdp_vlm_log_probs_cp2_with_perf(): + """Test VLM logprobs with CP2 and comprehensive performance statistics.""" + init() + config = make_baseline_config(config_path="./log_probs", config_name="log_probs_fsdp_vlm_cp2_config") + pipeline = TestFSDPVLMLogProbsPipeline(config) + results = pipeline.run() + + output_file = 
"test_fsdp_vlm_log_probs_cp2_with_perf.json" + with open(output_file, "w", encoding="utf-8") as f: + json.dump(results, f, indent=2, ensure_ascii=False) + + logger.info(f"Test FSDP VLM log probs (CP2) with performance stats completed!") + logger.info(f"Results saved to {output_file}") + + # Print summary + if results: + r = results[0] + logger.info("\n" + "="*80) + logger.info("PERFORMANCE SUMMARY") + logger.info("="*80) + logger.info(f"Correctness: {r['correctness']['is_correct']} (diff_max={r['correctness']['diff_max']:.6f})") + logger.info(f"Speedup: {r['performance']['speedup']:.2f}x") + logger.info(f"FSDP time: {r['performance']['time_fsdp_seconds']:.4f}s") + logger.info(f"Reference time: {r['performance']['time_ref_seconds']:.4f}s") + logger.info(f"FSDP throughput: {r['performance']['tokens_per_second_fsdp']:.0f} tokens/s") + logger.info(f"Memory increase: {r['memory']['fsdp_memory_increase_gb']:.2f} GB") + logger.info("="*80) + + +if __name__ == "__main__": + test_fsdp_vlm_log_probs_cp2_with_perf() \ No newline at end of file diff --git a/tests/distributed/strategy/log_probs/test_megatron_strategy.py b/tests/distributed/strategy/log_probs/test_megatron_strategy.py index 71e8ea907..452ae0c35 100644 --- a/tests/distributed/strategy/log_probs/test_megatron_strategy.py +++ b/tests/distributed/strategy/log_probs/test_megatron_strategy.py @@ -1,5 +1,5 @@ import json -from typing import Any, List, Dict +from typing import Any, Dict, List import ray import torch @@ -8,15 +8,16 @@ from roll.datasets.collator import DataCollatorWithPaddingForPaddedKeys from roll.datasets.loader import get_dataset -from roll.pipeline.base_worker import ActorWorker from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.initialize import init from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.utils.logging import get_logger -from tests.distributed.strategy.make_baseline_config import make_baseline_config +from tests.distributed.strategy.make_baseline_config import \ + make_baseline_config logger = get_logger() @@ -27,7 +28,6 @@ def __init__(self, pipeline_config: RLVRConfig): super().__init__(pipeline_config) self.tokenizer = default_tokenizer_provider( model_args=self.pipeline_config.reference.model_args, - template_name=self.pipeline_config.reference.data_args.template, ) self.dataset = get_dataset( tokenizer=self.tokenizer, @@ -83,11 +83,15 @@ def run(self): ref_log_probs_refs: List[ray.ObjectRef] = self.reference.compute_log_probs(batch, blocking=False) ref_log_probs = DataProto.materialize_concat(data_refs=ref_log_probs_refs) ref_log_probs.rename(old_keys="log_probs", new_keys="ref_log_probs") + if "entropy" in ref_log_probs.batch.keys(): + del ref_log_probs.batch["entropy"] ref_log_probs.meta_info.pop("metrics", {}) batch = batch.union(ref_log_probs) hf_log_probs: DataProto = self.actor_infer.compute_log_probs(batch) hf_log_probs.rename(old_keys="log_probs", new_keys="hf_log_probs") + if "entropy" in hf_log_probs.batch.keys(): + del hf_log_probs.batch["entropy"] hf_log_probs.meta_info.pop("metrics", {}) batch = batch.union(hf_log_probs) response_mask = batch.batch["response_mask"] diff --git a/tests/distributed/strategy/model_update/model_update_baseline_config.yaml 
b/tests/distributed/strategy/model_update/model_update_baseline_config.yaml index 59f3c8254..777a1cb9a 100644 --- a/tests/distributed/strategy/model_update/model_update_baseline_config.yaml +++ b/tests/distributed/strategy/model_update/model_update_baseline_config.yaml @@ -16,7 +16,7 @@ prompt_length: 1024 response_length: 1024 -pretrain: Qwen/Qwen2.5-7B-Instruct +pretrain: Qwen/Qwen3-8B actor_train: model_args: diff --git a/tests/distributed/strategy/model_update/model_update_debug.py b/tests/distributed/strategy/model_update/model_update_debug.py index 0dd18c6e1..0d6cb8c27 100644 --- a/tests/distributed/strategy/model_update/model_update_debug.py +++ b/tests/distributed/strategy/model_update/model_update_debug.py @@ -4,8 +4,10 @@ from roll.configs.worker_config import StrategyArguments from roll.distributed.scheduler.initialize import init from roll.utils.logging import get_logger -from tests.distributed.strategy.make_baseline_config import make_baseline_config -from tests.distributed.strategy.model_update.model_update_pipeline import ModelUpdatePipeline +from tests.distributed.strategy.make_baseline_config import \ + make_baseline_config +from tests.distributed.strategy.model_update.model_update_pipeline import \ + ModelUpdatePipeline logger = get_logger() @@ -16,13 +18,13 @@ def vllm_model_update_baseline(): init() ppo_config = make_baseline_config(config_path="./model_update", config_name="model_update_baseline_config") - # vllm_strategy_args = StrategyArguments(strategy_name="vllm", - # strategy_config={ - # "gpu_memory_utilization": 0.8, - # "block_size": 16, - # }) - # - # ppo_config.actor_infer.strategy_args = vllm_strategy_args + # Enable stat logging for vLLM to allow metrics collection + if ( + hasattr(ppo_config.actor_infer, "strategy_args") + and ppo_config.actor_infer.strategy_args.strategy_name == "vllm" + ): + if "disable_log_stats" not in ppo_config.actor_infer.strategy_args.strategy_config: + ppo_config.actor_infer.strategy_args.strategy_config["disable_log_stats"] = False pipeline = ModelUpdatePipeline(pipeline_config=ppo_config) @@ -53,6 +55,34 @@ def ds_2_hf_model_update_baseline(): logger.info(f"{json.dumps({'total_time': total_time, 'time_list': generate_times})}") +def fsdp2_train_model_update(): + os.environ["RAY_PROFILING"] = "1" + + init() + + ppo_config = make_baseline_config(config_path="./model_update", config_name="model_update_fsdp") + # Enable stat logging for vLLM to allow metrics collection + if ( + hasattr(ppo_config.actor_infer, "strategy_args") + and ppo_config.actor_infer.strategy_args.strategy_name == "vllm" + ): + if "disable_log_stats" not in ppo_config.actor_infer.strategy_args.strategy_config: + ppo_config.actor_infer.strategy_args.strategy_config["disable_log_stats"] = False + + pipeline = ModelUpdatePipeline(pipeline_config=ppo_config) + + metric_list = pipeline.run() + generate_times = [metric["time/model_update"] for metric in metric_list[:-2]] + total_time = sum(generate_times) + + logger.info(f"{json.dumps({'total_time': total_time, 'time_list': generate_times})}") + + output_file = "model_update_fsdp.json" + with open(output_file, "w") as f: + json.dump(metric_list, f, ensure_ascii=False) + + if __name__ == "__main__": - vllm_model_update_baseline() + # vllm_model_update_baseline() # ds_2_hf_model_update_baseline() + fsdp2_train_model_update() diff --git a/tests/distributed/strategy/model_update/model_update_fsdp.yaml b/tests/distributed/strategy/model_update/model_update_fsdp.yaml new file mode 100644 index 000000000..a6c6a1e99 --- 
/dev/null +++ b/tests/distributed/strategy/model_update/model_update_fsdp.yaml @@ -0,0 +1,70 @@ + +hydra: + run: + dir: . + output_subdir: null + +exp_name: "model_update_baseline" +seed: 42 +logging_dir: ./output/logs +output_dir: ./output + +track_with: stdout + +rollout_batch_size: 1024 +prompt_length: 1024 +response_length: 1024 + + +pretrain: Qwen/Qwen3-0.6B + +actor_train: + model_args: + dtype: bf16 + model_type: ~ + data_args: + template: qwen2_5 + file_name: data/comparison_gpt4_data_zh.json + dataset_dir: data + prompt: instruction + interleave_probs: "1.0" + training_args: + learning_rate: 5.0e-7 + weight_decay: 0 + per_device_train_batch_size: 1 + gradient_accumulation_steps: 2 + warmup_ratio: 0.1 + num_train_epochs: 1 + strategy_args: + strategy_name: fsdp2_train + strategy_config: + fsdp_size: 8 + param_dtype: bf16 + reduce_dtype: fp32 + reshard_after_forward: true + offload_policy: true + forward_prefetch: true + use_batched_model_update: true + device_mapping: list(range(0,8)) + + +actor_infer: + model_args: + dtype: bf16 + generating_args: + max_new_tokens: ${response_length} + top_p: 0.99 + top_k: 100 + num_beams: 1 + temperature: 0.99 + num_return_sequences: 1 + data_args: + template: qwen2_5 + strategy_args: + strategy_name: vllm + strategy_config: + gpu_memory_utilization: 0.8 + block_size: 16 + max_model_len: 6000 + tensor_parallel_size: 1 + device_mapping: list(range(0,8)) diff --git a/tests/distributed/strategy/model_update/model_update_multi_group_debug.py b/tests/distributed/strategy/model_update/model_update_multi_group_debug.py index 7ce740f93..03b4e3b1b 100644 --- a/tests/distributed/strategy/model_update/model_update_multi_group_debug.py +++ b/tests/distributed/strategy/model_update/model_update_multi_group_debug.py @@ -4,8 +4,10 @@ from roll.configs.worker_config import StrategyArguments from roll.distributed.scheduler.initialize import init from roll.utils.logging import get_logger -from tests.distributed.strategy.make_baseline_config import make_baseline_config -from tests.distributed.strategy.model_update.model_update_pipeline_multi_group import ModelUpdatePipeline +from tests.distributed.strategy.make_baseline_config import \ + make_baseline_config +from tests.distributed.strategy.model_update.model_update_pipeline_multi_group import \ + ModelUpdatePipeline logger = get_logger() @@ -16,13 +18,13 @@ def vllm_model_update_baseline(): init() ppo_config = make_baseline_config(config_path="./model_update", config_name="model_update_baseline_config") - # vllm_strategy_args = StrategyArguments(strategy_name="vllm", - # strategy_config={ - # "gpu_memory_utilization": 0.8, - # "block_size": 16, - # }) - # - # ppo_config.actor_infer.strategy_args = vllm_strategy_args + # Enable stat logging for vLLM to allow metrics collection + if ( + hasattr(ppo_config.actor_infer, "strategy_args") + and ppo_config.actor_infer.strategy_args.strategy_name == "vllm" + ): + if "disable_log_stats" not in ppo_config.actor_infer.strategy_args.strategy_config: + ppo_config.actor_infer.strategy_args.strategy_config["disable_log_stats"] = False pipeline = ModelUpdatePipeline(pipeline_config=ppo_config) @@ -53,6 +55,34 @@ def ds_2_hf_model_update_baseline(): logger.info(f"{json.dumps({'total_time': total_time, 'time_list': generate_times})}") +def fsdp2_train_model_update(): + os.environ["RAY_PROFILING"] = "1" + + init() + + ppo_config = make_baseline_config(config_path="./model_update", config_name="model_update_fsdp") + # Enable stat logging for vLLM to allow metrics collection + if ( + 
hasattr(ppo_config.actor_infer, "strategy_args") + and ppo_config.actor_infer.strategy_args.strategy_name == "vllm" + ): + if "disable_log_stats" not in ppo_config.actor_infer.strategy_args.strategy_config: + ppo_config.actor_infer.strategy_args.strategy_config["disable_log_stats"] = False + + pipeline = ModelUpdatePipeline(pipeline_config=ppo_config) + + metric_list = pipeline.run() + generate_times = [metric["time/model_update"] for metric in metric_list[:-2]] + total_time = sum(generate_times) + + logger.info(f"{json.dumps({'total_time': total_time, 'time_list': generate_times})}") + + output_file = "model_update_fsdp.json" + with open(output_file, "w") as f: + json.dump(metric_list, f, ensure_ascii=False, indent=2) + + if __name__ == "__main__": - vllm_model_update_baseline() + # vllm_model_update_baseline() # ds_2_hf_model_update_baseline() + fsdp2_train_model_update() diff --git a/tests/distributed/strategy/model_update/model_update_pipeline.py b/tests/distributed/strategy/model_update/model_update_pipeline.py index aeac940b7..c8fe7c42e 100644 --- a/tests/distributed/strategy/model_update/model_update_pipeline.py +++ b/tests/distributed/strategy/model_update/model_update_pipeline.py @@ -1,16 +1,19 @@ +import os from typing import Any, Dict +import ray import torch from codetiming import Timer -from roll.pipeline.base_worker import ActorWorker from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.utils.logging import get_logger + logger = get_logger() @@ -25,13 +28,13 @@ def __init__(self, pipeline_config: RLVRConfig): self.pipeline_config.set_max_steps(max_steps=1024) self.actor_train: Any = Cluster( name=self.pipeline_config.actor_train.name, - worker_cls=ActorWorker, + worker_cls=self.pipeline_config.actor_train.worker_cls, resource_manager=self.resource_manager, worker_config=self.pipeline_config.actor_train, ) self.actor_infer: Any = Cluster( name=self.pipeline_config.actor_infer.name, - worker_cls=ActorWorker, + worker_cls=self.pipeline_config.actor_infer.worker_cls, resource_manager=self.resource_manager, worker_config=self.pipeline_config.actor_infer, ) diff --git a/tests/distributed/strategy/model_update/model_update_pipeline_multi_group.py b/tests/distributed/strategy/model_update/model_update_pipeline_multi_group.py index 8d9a89eac..29b4cad69 100644 --- a/tests/distributed/strategy/model_update/model_update_pipeline_multi_group.py +++ b/tests/distributed/strategy/model_update/model_update_pipeline_multi_group.py @@ -4,12 +4,12 @@ import torch from codetiming import Timer -from roll.configs.worker_config import WorkerConfig, StrategyArguments -from roll.pipeline.base_worker import ActorWorker +from roll.configs.worker_config import StrategyArguments, WorkerConfig from roll.distributed.executor.cluster import Cluster from roll.distributed.scheduler.protocol import DataProto from roll.models.model_providers import default_tokenizer_provider from roll.pipeline.base_pipeline import BasePipeline +from roll.pipeline.base_worker import ActorWorker, InferWorker from roll.pipeline.rlvr.rlvr_config import RLVRConfig from roll.utils.logging import get_logger @@ -33,7 +33,7 @@ def __init__(self, pipeline_config: RLVRConfig): ) self.actor_infer: Any = Cluster( 
name=self.pipeline_config.actor_infer.name, - worker_cls=ActorWorker, + worker_cls=InferWorker, resource_manager=self.resource_manager, worker_config=self.pipeline_config.actor_infer, ) diff --git a/tests/distributed/strategy/standalone/fsdp2_standalone_strategy.py b/tests/distributed/strategy/standalone/fsdp2_standalone_strategy.py new file mode 100644 index 000000000..081c86be7 --- /dev/null +++ b/tests/distributed/strategy/standalone/fsdp2_standalone_strategy.py @@ -0,0 +1,523 @@ +import contextlib +import os +from dataclasses import dataclass +from typing import Any, Dict, Optional, Tuple + +import torch +import torch.distributed as dist +from torch.distributed.device_mesh import init_device_mesh +from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy +from transformers import AutoConfig, AutoModelForCausalLM, AutoModelForVision2Seq + +from roll.platforms import current_platform +from roll.utils.context_parallel import get_ulysses_group, set_upg_manager +from roll.utils.context_parallel.autograd_gather import ulysses_gather +from roll.utils.context_parallel.rmpad_ulysses import ( + gather_outputs_and_unpad, + ulysses_pad_and_slice_inputs, + ulysses_pad_inputs, +) +from roll.utils.fsdp_utils import ( + apply_fsdp2, + fsdp2_load_full_state_dict, + get_init_weight_context_manager, + get_shard_placement_fn, +) +from roll.utils.functionals import log_probs_from_logits + + +def _parse_dtype(dtype): + if dtype is None: + return None + if isinstance(dtype, torch.dtype): + return dtype + if isinstance(dtype, str): + dtype_lower = dtype.lower() + dtype_map = { + "bf16": torch.bfloat16, + "bfloat16": torch.bfloat16, + "fp16": torch.float16, + "float16": torch.float16, + "half": torch.float16, + "fp32": torch.float32, + "float32": torch.float32, + "float": torch.float32, + "fp64": torch.float64, + "float64": torch.float64, + } + if dtype_lower in dtype_map: + return dtype_map[dtype_lower] + if hasattr(torch, dtype): + return getattr(torch, dtype) + raise ValueError(f"Unsupported dtype string: {dtype}") + return dtype + + +def create_device_mesh_with_ulysses(world_size: int, fsdp_size: int): + """ + Matches `roll.distributed.strategy.fsdp2_strategy.create_device_mesh_with_ulysses`. + """ + if fsdp_size <= 1 or fsdp_size >= world_size: + mesh_shape = (world_size,) + mesh_dim_names = ["fsdp"] + else: + ddp_size = world_size // fsdp_size + mesh_shape = (ddp_size, fsdp_size) + mesh_dim_names = ["ddp", "fsdp"] + return init_device_mesh( + current_platform.device_type, + mesh_shape=mesh_shape, + mesh_dim_names=mesh_dim_names, + ) + + +def _validate_ulysses_compat(config, cp_size: int): + try: + num_attention_heads, num_key_value_heads = ( + config.num_attention_heads, + config.num_key_value_heads, + ) + except AttributeError: + num_attention_heads, num_key_value_heads = ( + config.text_config.num_attention_heads, + config.text_config.num_key_value_heads, + ) + + assert ( + num_attention_heads % cp_size == 0 + ), f"num_attention_heads {num_attention_heads} must be divisible by ulysses_size {cp_size}" + assert num_key_value_heads % cp_size == 0 or cp_size % num_key_value_heads == 0, ( + f"num_key_value_heads {num_key_value_heads} must be divisible by ulysses_size " + f"{cp_size} or vice versa. Upon ulysses_size % num_key_value_heads == 0, " + f"kv heads are repeated to ensure correctness." 
+ ) + + +@dataclass +class StandaloneRankInfo: + dp_rank: int + dp_size: int + cp_rank: int + cp_size: int + + +@dataclass +class StandaloneFSDP2Config: + model_name_or_path: str + is_trainable: bool = False + # FSDP2 + param_dtype: torch.dtype = torch.bfloat16 + reduce_dtype: torch.dtype = torch.float32 + reshard_after_forward: bool = True + fsdp_size: int = 1 + cpu_offload: bool = False + # CP(Ulysses) + ulysses_size: int = 1 + use_remove_padding: bool = False + # HF + trust_remote_code: bool = True + attn_implementation: Optional[str] = None # e.g. "fa2" / "sdpa" / None + + +class StandaloneFSDP2Strategy: + def __init__(self, cfg: StandaloneFSDP2Config): + self.cfg = cfg + self.rank_info: Optional[StandaloneRankInfo] = None + self.device_mesh = None + self.fsdp_config: Optional[Dict[str, Any]] = None + self.model: Optional[torch.nn.Module] = None + self.config = None + self.param_dtype = _parse_dtype(cfg.param_dtype) + self.reduce_dtype = _parse_dtype(cfg.reduce_dtype) + + def _init_dist_if_needed(self): + if dist.is_initialized(): + return + if current_platform.device_type != "cpu": + backends_str = f"cpu:gloo,{current_platform.device_type}:{current_platform.communication_backend}" + else: + backends_str = current_platform.communication_backend + dist.init_process_group(backend=backends_str) + + def _setup_rank_info(self) -> StandaloneRankInfo: + world_size = dist.get_world_size() + global_rank = dist.get_rank() + + cp_size = int(self.cfg.ulysses_size or 1) + if cp_size > 1: + patch_info = current_platform.apply_ulysses_patch() + if patch_info is None: + cp_size = 1 + + dp_rank = global_rank // cp_size + dp_size = world_size // cp_size + cp_rank = global_rank % cp_size + + info = StandaloneRankInfo(dp_rank=dp_rank, dp_size=dp_size, cp_rank=cp_rank, cp_size=cp_size) + self.rank_info = info + return info + + def _setup_device(self): + if current_platform.device_type == "cuda": + local_rank = int(os.environ.get("LOCAL_RANK", str(dist.get_rank()))) + torch.cuda.set_device(local_rank) + + def setup_fsdp2_configuration(self): + mixed_precision = MixedPrecisionPolicy( + param_dtype=self.param_dtype, + reduce_dtype=self.reduce_dtype, + cast_forward_inputs=True, + ) + + offload_policy = None + if bool(self.cfg.cpu_offload): + offload_policy = CPUOffloadPolicy(pin_memory=True) + + self.fsdp_config = { + "mesh": self.device_mesh, + "reshard_after_forward": bool(self.cfg.reshard_after_forward), + "mp_policy": mixed_precision, + "offload_policy": offload_policy, + "shard_placement_fn": get_shard_placement_fn(fsdp_size=int(self.cfg.fsdp_size or 1)), + } + + def _pick_model_class(self, cfg) -> Any: + if type(cfg) in AutoModelForVision2Seq._model_mapping.keys(): # assume built-in models + return AutoModelForVision2Seq + return AutoModelForCausalLM + + def _apply_roll_model_patches(self, model: torch.nn.Module, cfg) -> None: + # Mirror the important parts of `roll.models.model_providers.load_model` that affect CP/FSDP2. + model_type = getattr(cfg, "model_type", None) or "" + ulysses_size = int(self.rank_info.cp_size if self.rank_info is not None else 1) + # Apply the same shared model forward patches as the main codebase. 
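+ # patch_model covers the shared forward patches applied by load_model; the extra
+ # CP decoder patch below is only required when ulysses_size > 1 and the config
+ # actually carries a vision_config (currently qwen2_5_vl / qwen3_vl).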
+ from roll.models.model_providers import patch_model + + patch_model(model, cfg, use_mcore=False) + + if ulysses_size > 1 and getattr(cfg, "vision_config", None) is not None: + if model_type in ("qwen2_5_vl", "qwen3_vl"): + from roll.utils.context_parallel.vlm_cp_patch import find_vlm_text_decoder, patch_vlm_decoder_for_cp + + decoder = find_vlm_text_decoder(model) + if decoder is not None: + patch_vlm_decoder_for_cp(decoder, name=f"{model_type}.text_decoder") + + if getattr(cfg, "vision_config", None) is not None: + # Ensure vision tower blocks do not cast forward inputs under FSDP2. + from roll.models.model_providers import get_vl_model_vision_tower_blocks + + vision_tower_blocks = get_vl_model_vision_tower_blocks(model) + if vision_tower_blocks is not None: + for block in vision_tower_blocks: + block._fsdp2_cast_forward_inputs = False + + def initialize(self): + self._init_dist_if_needed() + self._setup_device() + info = self._setup_rank_info() + + world_size = dist.get_world_size() + + fsdp_size = int(self.cfg.fsdp_size or 1) + if info.cp_size > 1 and (fsdp_size <= 1 or fsdp_size >= world_size): + fsdp_size = world_size // info.cp_size + self.cfg.fsdp_size = fsdp_size + + if info.cp_size > 1: + set_upg_manager(ulysses_size=info.cp_size, rank=dist.get_rank(), world_size=world_size) + + self.device_mesh = create_device_mesh_with_ulysses(world_size=world_size, fsdp_size=fsdp_size) + + hf_cfg = AutoConfig.from_pretrained(self.cfg.model_name_or_path, trust_remote_code=self.cfg.trust_remote_code) + self.config = hf_cfg + if info.cp_size > 1: + _validate_ulysses_compat(hf_cfg, info.cp_size) + + if getattr(hf_cfg, "vision_config", None) is not None: + vc = hf_cfg.vision_config + setattr(vc, "_attn_implementation", "sdpa") + setattr(vc, "attn_implementation", "sdpa") + + setattr(hf_cfg, "use_cache", not bool(self.cfg.is_trainable)) + + use_meta_tensor = not getattr(hf_cfg, "tie_word_embeddings", False) + init_context = get_init_weight_context_manager(use_meta_tensor=use_meta_tensor, mesh=self.device_mesh) + + model_cls = self._pick_model_class(hf_cfg) + with init_context(): + model = model_cls.from_pretrained( + self.cfg.model_name_or_path, + config=hf_cfg, + trust_remote_code=self.cfg.trust_remote_code, + low_cpu_mem_usage=False, + ) + + self._apply_roll_model_patches(model, hf_cfg) + is_lora = getattr(model, "peft_config", None) is not None + + full_state = model.state_dict() + + self.setup_fsdp2_configuration() + assert self.fsdp_config is not None + # `apply_fsdp2()` needs a wrap policy list. Most HF models expose `_no_split_modules`, + # but some custom models may not; fall back to a conservative module-level wrap. 
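+ # _no_split_modules usually names the transformer block class (e.g. the decoder
+ # layer), which is the granularity apply_fsdp2 shards at; wrapping plain "Linear"
+ # modules is only a last-resort fallback for models that do not expose it.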
+ wrap_list = getattr(model, "_no_split_modules", None) + if not wrap_list: + wrap_list = ["Linear"] + strategy_cfg = {"wrap_policy": {"transformer_layer_cls_to_wrap": wrap_list}} + apply_fsdp2(model, self.fsdp_config, config=strategy_cfg, is_lora=is_lora) + + fsdp2_load_full_state_dict( + model=model, + full_state=full_state, + device_mesh=self.device_mesh, + cpu_offload=self.fsdp_config["offload_policy"], + ) + + self.model = model + dist.barrier() + + def unwrap_model(self): + if self.model is None: + return None + return getattr(self.model, "module", self.model) + + def get_feature_on_cp_rank( + self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor = None, + position_ids: torch.Tensor = None, + ): + assert self.rank_info is not None + seqlens_in_batch = input_ids.size(1) + assert ( + seqlens_in_batch % self.rank_info.cp_size == 0 + ), f"input_length={seqlens_in_batch} not divisible by cp_size={self.rank_info.cp_size}" + cp_middle_rank_len = seqlens_in_batch // self.rank_info.cp_size + start_index = cp_middle_rank_len * self.rank_info.cp_rank + end_index = cp_middle_rank_len * (self.rank_info.cp_rank + 1) + + result = {"input_ids": input_ids[:, start_index:end_index]} + if attention_mask is not None: + result["attention_mask"] = attention_mask[:, start_index:end_index] + if position_ids is not None: + if position_ids.dim() == 3: + result["position_ids"] = position_ids[:, :, start_index:end_index] + else: + result["position_ids"] = position_ids[:, start_index:end_index] + return result + + def fsdp2_forward( + self, + *, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + forward_args: Optional[Dict[str, Any]] = None, + ) -> torch.Tensor: + """ + Mirrors `FSDP2InferStrategy._fsdp2_forward`. + Returns logits (possibly CP-sliced then gathered/padded back to keep downstream shape consistent). + """ + assert self.model is not None + assert self.rank_info is not None + forward_args = dict(forward_args or {}) + + cp_size = self.rank_info.cp_size + cp_rank = self.rank_info.cp_rank + + underlying = self.unwrap_model() + model_type = getattr(getattr(underlying, "config", None), "model_type", "") or "" + is_vlm = getattr(getattr(underlying, "config", None), "vision_config", None) is not None + is_supported_vlm = is_vlm and model_type in ("qwen2_5_vl", "qwen2_vl", "qwen3_vl", "qwen3_vl_moe") + + if "use_cache" not in forward_args: + forward_args["use_cache"] = False + + # Remove padding + CP path + if cp_size > 1 and self.cfg.use_remove_padding: + try: + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input + except Exception as e: + raise RuntimeError("use_remove_padding=True requires flash_attn installed.") from e + + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) # (1, total_nnz) + + if position_ids is None: + raise RuntimeError("remove_padding path requires position_ids.") + + if position_ids.dim() == 3: + position_ids_rmpad = ( + index_first_axis( + rearrange(position_ids, "c b s ... -> (b s) c ..."), + indices, + ) + .transpose(0, 1) + .unsqueeze(1) + ) # (C, 1, total_nnz) + else: + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... 
-> (b s) ..."), + indices, + ).transpose(0, 1) + + if is_supported_vlm: + input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_inputs( + input_ids_rmpad, + position_ids_rmpad, + cp_size=cp_size, + ) + else: + input_ids_rmpad, position_ids_rmpad, pad_size = ulysses_pad_and_slice_inputs( + input_ids_rmpad, + position_ids_rmpad, + cp_size=cp_size, + cp_rank=cp_rank, + ) + + output = self.model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + **forward_args, + ) + logits_rmpad = output.logits # (1, local_tokens, vocab) + + logits_rmpad = gather_outputs_and_unpad( + logits_rmpad, + gather_dim=1, + unpad_dim=1, + padding_size=pad_size, + group=get_ulysses_group(), + ) + + logits = pad_input( + hidden_states=logits_rmpad.squeeze(0).unsqueeze(-1), + indices=indices, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + + features = self.get_feature_on_cp_rank(logits) + return features["input_ids"] + + # CP slicing path (non-rmpad) + if cp_size > 1 and (not is_supported_vlm): + feats = self.get_feature_on_cp_rank(input_ids, attention_mask, position_ids) + input_ids = feats["input_ids"] + attention_mask = feats["attention_mask"] + position_ids = feats["position_ids"] + + if not self.cfg.use_remove_padding: + if cp_size > 1 and is_supported_vlm: + assert ( + input_ids.size(1) % cp_size == 0 + ), f"input_length={input_ids.size(1)} not divisible by cp_size={cp_size} for VLM non-rmpad CP" + logits_local = self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **forward_args, + ).logits # (bs, local_seq, vocab) + logits_full = gather_outputs_and_unpad( + logits_local, + gather_dim=1, + unpad_dim=None, + padding_size=0, + group=get_ulysses_group(), + ) + features = self.get_feature_on_cp_rank(logits_full) + return features["input_ids"] + + return self.model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + **forward_args, + ).logits + + # remove-padding without CP (or cp_size==1) + try: + from flash_attn.bert_padding import index_first_axis, pad_input, rearrange, unpad_input + except Exception as e: + raise RuntimeError("use_remove_padding=True requires flash_attn installed.") from e + + input_ids_rmpad, indices, *_ = unpad_input(input_ids.unsqueeze(-1), attention_mask) + input_ids_rmpad = input_ids_rmpad.transpose(0, 1) + + if position_ids is None: + raise RuntimeError("remove_padding path requires position_ids.") + + if position_ids.dim() == 3: + position_ids_rmpad = ( + index_first_axis( + rearrange(position_ids, "c b s ... -> (b s) c ..."), + indices, + ) + .transpose(0, 1) + .unsqueeze(1) + ) + else: + position_ids_rmpad = index_first_axis( + rearrange(position_ids.unsqueeze(-1), "b s ... -> (b s) ..."), + indices, + ).transpose(0, 1) + + output = self.model( + input_ids=input_ids_rmpad, + attention_mask=None, + position_ids=position_ids_rmpad, + **forward_args, + ) + logits = pad_input( + hidden_states=output.logits.squeeze(0).unsqueeze(-1), + indices=indices, + batch=input_ids.size(0), + seqlen=input_ids.size(1), + ).squeeze(-1) + return logits + + def compute_log_probs( + self, + *, + logits: torch.Tensor, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + ) -> torch.Tensor: + """ + Mirrors `FSDP2InferStrategy.op_compute_log_probs`. + Returns per-token logprobs aligned to `attention_mask[:, 1:]` (shifted labels). 
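+        With cp_size > 1, labels are padded with one trailing dummy token and sliced to the
+        local CP window; per-rank log-probs are then gathered back across the Ulysses group
+        before the trailing position is dropped and the attention mask is applied.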
+ """ + assert self.rank_info is not None + + labels = input_ids[:, 1:].clone() + labels[attention_mask[:, 1:] == 0] = 0 + + if self.rank_info.cp_size > 1: + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + labels = self.get_feature_on_cp_rank(labels)["input_ids"] + + log_probs = log_probs_from_logits(logits, labels) + log_probs = ulysses_gather( + log_probs, + gather_dim=1, + group=get_ulysses_group(), + grad_scaler=True, + ) + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + else: + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], dim=1) + log_probs = log_probs_from_logits(logits, labels) + log_probs = log_probs[:, :-1] * attention_mask[:, 1:] + + return log_probs + + @contextlib.contextmanager + def autocast(self): + if current_platform.device_type == "cpu": + yield + return + with torch.autocast(device_type=current_platform.device_type, dtype=self.param_dtype): + yield diff --git a/tests/distributed/strategy/standalone/run_fsdp2_standalone.py b/tests/distributed/strategy/standalone/run_fsdp2_standalone.py new file mode 100644 index 000000000..d1d462899 --- /dev/null +++ b/tests/distributed/strategy/standalone/run_fsdp2_standalone.py @@ -0,0 +1,114 @@ +import argparse +import os +from typing import Any, Dict, Optional + +import torch +import torch.distributed as dist +from transformers import AutoTokenizer + +from tests.distributed.strategy.standalone.fsdp2_standalone_strategy import ( + StandaloneFSDP2Config, + StandaloneFSDP2Strategy, +) + + +def _build_text_batch( + *, + tokenizer, + prompt: str, + response: str, + device: torch.device, + max_length: int, + model_name_or_path: str, +): + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + text = prompt + response + enc = tokenizer( + [text], + return_tensors="pt", + padding="max_length", + truncation=True, + max_length=max_length, + ) + input_ids = enc["input_ids"].to(device) + attention_mask = enc["attention_mask"].to(device) + + bsz, seqlen = input_ids.shape + position_ids = torch.arange(seqlen, dtype=torch.long, device=device).unsqueeze(0).expand(bsz, -1) + position_ids = position_ids.masked_fill(attention_mask == 0, 0) + return input_ids, attention_mask, position_ids + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("--model", required=True, help="Local model path (preferred for standalone runs).") + parser.add_argument("--prompt", default="Hello", help="Prompt text.") + parser.add_argument("--response", default=" world", help="Response text (appended to prompt).") + parser.add_argument("--max-length", type=int, default=128) + parser.add_argument("--cp-size", type=int, default=1) + parser.add_argument("--fsdp-size", type=int, default=1) + parser.add_argument("--param-dtype", default="bf16", choices=["bf16", "fp16", "fp32"]) + parser.add_argument("--reduce-dtype", default="fp32", choices=["bf16", "fp16", "fp32"]) + parser.add_argument("--reshard-after-forward", type=int, default=1, choices=[0, 1]) + parser.add_argument("--cpu-offload", type=int, default=0, choices=[0, 1]) + parser.add_argument("--use-remove-padding", type=int, default=0, choices=[0, 1]) + args = parser.parse_args() + + cfg = StandaloneFSDP2Config( + model_name_or_path=args.model, + is_trainable=False, + ulysses_size=int(args.cp_size), + fsdp_size=int(args.fsdp_size), + param_dtype=args.param_dtype, + reduce_dtype=args.reduce_dtype, + reshard_after_forward=bool(args.reshard_after_forward), + cpu_offload=bool(args.cpu_offload), + 
use_remove_padding=bool(args.use_remove_padding), + ) + strat = StandaloneFSDP2Strategy(cfg) + strat.initialize() + + rank = dist.get_rank() + device = ( + torch.device("cuda", int(os.environ.get("LOCAL_RANK", "0"))) + if torch.cuda.is_available() + else torch.device("cpu") + ) + + tokenizer = AutoTokenizer.from_pretrained( + args.model, local_files_only=True, trust_remote_code=True, padding_side="left" + ) + input_ids, attention_mask, position_ids = _build_text_batch( + tokenizer=tokenizer, + prompt=args.prompt, + response=args.response, + device=device, + max_length=int(args.max_length), + model_name_or_path=args.model, + ) + + with torch.no_grad(), strat.autocast(): + logits = strat.fsdp2_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + forward_args={"use_cache": False}, + ) + log_probs = strat.compute_log_probs(logits=logits, input_ids=input_ids, attention_mask=attention_mask) + + scalar = log_probs.sum() + dist.all_reduce(scalar) + if rank == 0: + print( + f"[standalone fsdp2] world_size={dist.get_world_size()} cp_size={strat.rank_info.cp_size} " + f"fsdp_size={cfg.fsdp_size} remove_padding={cfg.use_remove_padding} " + f"log_probs_sum(all_reduce)={scalar.item():.4f}" + ) + + dist.barrier() + dist.destroy_process_group() + + +if __name__ == "__main__": + main() diff --git a/tests/distributed/strategy/test_fsdp_strategy_collection.py b/tests/distributed/strategy/test_fsdp_strategy_collection.py new file mode 100644 index 000000000..d20cfb5e3 --- /dev/null +++ b/tests/distributed/strategy/test_fsdp_strategy_collection.py @@ -0,0 +1,918 @@ +import os +import random +import socket +from types import SimpleNamespace + +import numpy as np +import pytest +import torch +import torch.distributed as dist +import torch.multiprocessing as mp +from tensordict import TensorDict +from torch.distributed.fsdp import CPUOffloadPolicy, MixedPrecisionPolicy +from torch.distributed.tensor import DTensor + +from roll.distributed.scheduler.protocol import DataProto +from roll.distributed.strategy import fsdp2_strategy +from roll.distributed.strategy.fsdp2_strategy import ( + FSDP2InferStrategy, FSDP2StrategyBase, FSDP2TrainStrategy, + create_device_mesh_with_ulysses) +from roll.platforms import current_platform +from roll.utils.fsdp_utils import (apply_fsdp2, fsdp2_load_full_state_dict, + get_shard_placement_fn) +from roll.utils.offload_states import OffloadStateType + + +class _PlatformStub: + def __init__(self, device_type="cpu", backend=None): + self.device_type = device_type + if backend is None: + backend = "nccl" if device_type == "cuda" else "gloo" + self.communication_backend = backend + + def current_device(self): + if self.device_type == "cuda": + current = ( + torch.cuda.current_device() + if torch.cuda.is_available() + else 0 + ) + return torch.device("cuda", current) + return "cpu" + + def apply_ulysses_patch(self): + return None + + +class DummyTrainingArgs: + def __init__(self): + self.per_device_train_batch_size = 2 + self.gradient_accumulation_steps = 1 + self.learning_rate = 3e-4 + self.adam_beta1 = 0.9 + self.adam_beta2 = 0.95 + self.weight_decay = 0.01 + self.lr_scheduler_type = "linear" + self.max_steps = 10 + + def get_warmup_steps(self, max_steps): + return 1 + + +class DummyModelArgs: + def __init__(self, ulysses_size=1): + self.ulysses_size = ulysses_size + self.model_name_or_path = "dummy-model" + self.model_config_kwargs = {} + self.lora_target = None + + +def make_worker( + strategy_config=None, use_remove_padding=False, 
ulysses_size=1 +): + worker_config = SimpleNamespace( + name="dummy_worker", + training_args=DummyTrainingArgs(), + model_args=DummyModelArgs(ulysses_size=ulysses_size), + strategy_args=SimpleNamespace( + strategy_config=strategy_config or {} + ), + use_remove_padding=use_remove_padding, + checkpoint_config=None, + offload_nccl=False, + ) + worker = SimpleNamespace( + worker_config=worker_config, + pipeline_config=SimpleNamespace(seed=0, max_grad_norm=1.0), + rank_info=SimpleNamespace( + dp_rank=0, + dp_size=1, + cp_rank=0, + cp_size=1, + tp_rank=0, + pp_rank=0, + ), + world_size=1, + rank=0, + ) + return worker + + +@pytest.fixture +def worker_factory(): + def _factory( + strategy_config=None, use_remove_padding=False, ulysses_size=1 + ): + return make_worker( + strategy_config=strategy_config, + use_remove_padding=use_remove_padding, + ulysses_size=ulysses_size, + ) + + return _factory + + +@pytest.fixture +def strategy_factory(worker_factory): + strategies = [] + + def _factory(strategy_cls, **worker_kwargs): + worker = worker_factory(**worker_kwargs) + strategy = strategy_cls(worker) + strategies.append(strategy) + return strategy + + yield _factory + + for strategy in strategies: + strategy.thread_executor.shutdown(wait=True) + + +@pytest.fixture +def platform_stub(): + return _PlatformStub() + + +@pytest.fixture(autouse=True) +def _patch_platform(monkeypatch, platform_stub): + monkeypatch.setattr(fsdp2_strategy, "current_platform", platform_stub) + + +class DummyCheckpointManager: + def __init__(self, checkpoint_config=None): + self.checkpoint_config = checkpoint_config + self.upload_calls = [] + + def upload(self, *args, **kwargs): + self.upload_calls.append((args, kwargs)) + + +@pytest.fixture(autouse=True) +def patch_checkpoint_manager(monkeypatch): + monkeypatch.setattr( + fsdp2_strategy, "CheckpointManager", DummyCheckpointManager + ) + + +class DummyForwardModel(torch.nn.Module): + def __init__(self, logits): + super().__init__() + self.kwargs = None + self._ret = SimpleNamespace(logits=logits) + + def forward(self, **kwargs): + self.kwargs = kwargs + return self._ret + + +class MockModel: + def __init__(self): + self.to_calls = [] + self.cpu_called = False + + def to(self, device, non_blocking=False): + self.to_calls.append((device, non_blocking)) + return self + + def cpu(self): + self.cpu_called = True + return self + + +def test_create_device_mesh_with_ulysses_global_mesh( + monkeypatch, platform_stub +): + """1D global mesh""" + captured = {} + + def fake_init(device_type, mesh_shape, mesh_dim_names): + captured["device_type"] = device_type + captured["mesh_shape"] = mesh_shape + captured["mesh_dim_names"] = mesh_dim_names + return "mesh" + + monkeypatch.setattr(fsdp2_strategy, "init_device_mesh", fake_init) + + mesh = create_device_mesh_with_ulysses(world_size=4, fsdp_size=1) + + assert mesh == "mesh" + assert captured["device_type"] == platform_stub.device_type + assert captured["mesh_shape"] == (4,) + assert captured["mesh_dim_names"] == ["fsdp"] + + +def test_create_device_mesh_with_ulysses_hsdp_mesh(monkeypatch): + """2D HSDP mesh""" + captured = {} + + def fake_init(device_type, mesh_shape, mesh_dim_names): + captured["mesh_shape"] = mesh_shape + captured["mesh_dim_names"] = mesh_dim_names + return "mesh" + + monkeypatch.setattr(fsdp2_strategy, "init_device_mesh", fake_init) + + mesh = create_device_mesh_with_ulysses(world_size=8, fsdp_size=4) + + assert mesh == "mesh" + assert captured["mesh_shape"] == (2, 4) + assert captured["mesh_dim_names"] == ["ddp", "fsdp"] + 
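+# Note: create_device_mesh_with_ulysses factors world_size into
+# (world_size // fsdp_size, fsdp_size) when 1 < fsdp_size < world_size, giving the outer
+# "ddp" replica dimension asserted above (world_size=8, fsdp_size=4 -> mesh shape (2, 4)).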
+ +def test_build_checkpoint_paths_uses_rank_and_world(strategy_factory): + """Test that the checkpoint paths are built correctly""" + strategy = strategy_factory(FSDP2StrategyBase) + model_path, optim_path, extra_path = strategy._build_checkpoint_paths( + "/tmp/ckpts", world_size=2, dp_rank=1 + ) + assert model_path.endswith("model_world_size_2_rank_1.pt") + assert optim_path.endswith("optim_world_size_2_rank_1.pt") + assert extra_path.endswith("extra_state_world_size_2_rank_1.pt") + + +def test_copy_weight_to_param(strategy_factory): + """Test that the weight is copied to the parameter correctly""" + strategy = strategy_factory(FSDP2StrategyBase) + param = torch.nn.Parameter(torch.zeros(3)) + weight = torch.arange(3).float() + + strategy._copy_weight_to_param(param, weight) + + assert torch.allclose(param.detach(), weight) + + +def test_gather_full_tensor_returns_clone(strategy_factory): + strategy = strategy_factory(FSDP2StrategyBase) + param = torch.nn.Parameter(torch.tensor([1.0, 2.0])) + + gathered = strategy._gather_full_tensor(param) + assert torch.allclose(gathered, param.detach()) + + # _gather_full_tensor needs to return a detached clone of the parameter; + gathered += 1 + assert torch.allclose(param.detach(), torch.tensor([1.0, 2.0])) + + +def test_move_optimizer_states_respects_target_device( + strategy_factory, monkeypatch +): + """ + Make sure that the optimizer states are moved to the correct device after load/offload. + """ + strategy = strategy_factory(FSDP2StrategyBase) + + class FakeTensor: + def __init__(self): + self.device = "cpu" + + def to(self, device, non_blocking=False): + self.device = device + return self + + fake_tensor = FakeTensor() + strategy.optimizer = SimpleNamespace( + state={"p": {"momentum": fake_tensor}} + ) + + orig_is_tensor = fsdp2_strategy.torch.is_tensor + monkeypatch.setattr( + fsdp2_strategy.torch, + "is_tensor", + lambda obj: isinstance(obj, FakeTensor) or orig_is_tensor(obj), + ) + + strategy._move_optimizer_states("meta") + + assert fake_tensor.device == "meta" + + +def test_get_broadcast_tensor_returns_cpu_view(strategy_factory): + strategy = strategy_factory(FSDP2StrategyBase) + weight_cpu = torch.ones(5) + + result = strategy._get_broadcast_tensor(weight_cpu) + + assert result is weight_cpu + + +def test_get_feature_on_cp_rank_slices_correct_window(strategy_factory): + strategy = strategy_factory(FSDP2InferStrategy) + strategy.worker.rank_info.cp_size = 2 + strategy.worker.rank_info.cp_rank = 1 + + input_ids = torch.arange(8).view(1, 8) + attention_mask = torch.ones_like(input_ids) + position_ids = torch.arange(16).view(2, 1, 8) + + features = strategy.get_feature_on_cp_rank( + input_ids, attention_mask, position_ids + ) + + expected_ids = torch.arange(4, 8).view(1, 4) + assert torch.equal(features["input_ids"], expected_ids) + assert torch.equal( + features["attention_mask"], torch.ones_like(expected_ids) + ) + assert torch.equal( + features["position_ids"], + torch.tensor( + [[[4, 5, 6, 7]], [[12, 13, 14, 15]]], dtype=position_ids.dtype + ), + ) + + +def test_op_compute_log_probs_matches_manual(strategy_factory): + strategy = strategy_factory(FSDP2InferStrategy) + logits = torch.tensor([[[0.0, 1.0], [1.0, 0.0], [0.5, -0.5]]]) + input_ids = torch.tensor([[0, 1, 0]]) + attention_mask = torch.tensor([[1, 1, 0]]) + + result = strategy.op_compute_log_probs( + logits, input_ids, attention_mask + ) + + labels = input_ids[:, 1:].clone() + labels[attention_mask[:, 1:] == 0] = 0 + labels = torch.cat([labels, torch.zeros_like(labels[:, :1])], 
dim=1) + log_probs = ( + torch.nn.functional.log_softmax(logits.float(), dim=-1) + .gather(dim=-1, index=labels.unsqueeze(-1)) + .squeeze(-1) + ) + expected = log_probs[:, :-1] * attention_mask[:, 1:] + + assert torch.allclose(result, expected) + + +def test_op_compute_entropy_masks_prompt(strategy_factory): + strategy = strategy_factory(FSDP2InferStrategy) + logits = torch.tensor( + [[[0.0, 1.0], [1.5, 0.5], [0.3, 0.7], [1.2, 0.2]]] + ) + attention_mask = torch.tensor([[1, 1, 1, 0]]) + + result = strategy.op_compute_entropy(logits, attention_mask) + + probs = torch.softmax(logits.float(), dim=-1) + manual_entropy = torch.logsumexp(logits.float(), dim=-1) - ( + probs * logits + ).sum(dim=-1) + expected = manual_entropy[:, :-1] * attention_mask[:, 1:] + + assert torch.allclose(result, expected) + + +def test_setup_fsdp2_configuration_respects_strategy_config( + strategy_factory, +): + strategy_config = { + "param_dtype": torch.float16, + "reduce_dtype": torch.float32, + "reshard_after_forward": False, + "offload_policy": True, + "fsdp_size": 2, + } + strategy = strategy_factory( + FSDP2InferStrategy, strategy_config=strategy_config + ) + strategy.device_mesh = "mesh-handle" + + strategy.setup_fsdp2_configuration() + + cfg = strategy.fsdp_config + assert cfg["mesh"] == "mesh-handle" + assert cfg["reshard_after_forward"] is False + assert cfg["offload_policy"] is not False + assert cfg["mp_policy"].param_dtype == torch.float16 + assert callable(cfg["shard_placement_fn"]) + + +def test_clip_grad_norm_cpu_offload_uses_dummy_helper( + strategy_factory, monkeypatch +): + strategy = strategy_factory(FSDP2TrainStrategy) + strategy.model = torch.nn.Linear(2, 2, bias=False) + expected_params = list(strategy.model.parameters()) + + for param in expected_params: + param.grad = torch.ones_like(param) + + strategy.cpu_offload_enabled = True + + recorded = {} + + def fake_get_total_norm(grads, norm_type, error_if_nonfinite, foreach): + recorded["total_norm_args"] = ( + list(grads), + norm_type, + error_if_nonfinite, + foreach, + ) + return torch.tensor(2.0) + + def fake_clip_grads_with_norm_(parameters, max_norm, total_norm, foreach): + recorded["clip_args"] = ( + list(parameters), + max_norm, + total_norm.clone(), + foreach, + ) + + monkeypatch.setattr( + fsdp2_strategy, "_get_total_norm", fake_get_total_norm + ) + monkeypatch.setattr( + fsdp2_strategy, "_clip_grads_with_norm_", fake_clip_grads_with_norm_ + ) + + returned_norm = strategy._clip_grad_norm(max_norm=1.0) + + assert "total_norm_args" in recorded + grads_arg, norm_type, err_flag, foreach_flag = recorded["total_norm_args"] + assert grads_arg == [param.grad for param in expected_params] + assert norm_type == 2.0 + assert err_flag is False + assert foreach_flag is None + + assert "clip_args" in recorded + clip_params, clip_max_norm, clip_total_norm, clip_foreach = recorded[ + "clip_args" + ] + assert clip_params == expected_params + assert clip_max_norm == 1.0 + assert clip_foreach is None + assert clip_total_norm.item() == pytest.approx(2.0) + + assert returned_norm.item() == pytest.approx(2.0) + + +def _fsdp2_cpu_offload_grad_clip_worker(rank, world_size, port): + use_cuda = torch.cuda.is_available() + backend = "nccl" if use_cuda else "gloo" + fsdp2_strategy.current_platform = _PlatformStub( + device_type="cuda" if use_cuda else "cpu", + backend=backend, + ) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + dist.init_process_group( + 
backend=backend, + rank=rank, + world_size=world_size, + ) + try: + if use_cuda: + torch.cuda.set_device(rank % torch.cuda.device_count()) + device = torch.device("cuda", torch.cuda.current_device()) + else: + device = torch.device("cpu") + + model = _TinyMLP(input_dim=4, hidden_dim=4, output_dim=2).to(device) + mesh = create_device_mesh_with_ulysses( + world_size=world_size, fsdp_size=world_size + ) + mp_policy = ( + MixedPrecisionPolicy( + param_dtype=torch.float16, + reduce_dtype=torch.float32, + cast_forward_inputs=True, + ) + if use_cuda + else None + ) + offload_policy = ( + CPUOffloadPolicy(pin_memory=True) if use_cuda else False + ) + fsdp_kwargs = { + "mesh": mesh, + "reshard_after_forward": True, + "mp_policy": mp_policy, + "offload_policy": offload_policy, + "shard_placement_fn": get_shard_placement_fn(world_size), + } + full_state = model.state_dict() + apply_fsdp2(model, fsdp_kwargs, {"fsdp_size": world_size}) + fsdp2_load_full_state_dict(model, full_state, mesh, offload_policy) + + features = torch.randn(2, 4, device=device, requires_grad=False) + targets = torch.randn(2, 2, device=device, requires_grad=False) + loss = model(features, targets) + loss.backward() + + strategy = FSDP2TrainStrategy.__new__(FSDP2TrainStrategy) + strategy.model = model + strategy.cpu_offload_enabled = True + + total_norm = strategy._clip_grad_norm(max_norm=0.5) + scalar_norm = ( + total_norm.to_local() if hasattr(total_norm, "to_local") else total_norm + ) + scalar_norm = float(scalar_norm.detach().cpu().item()) + gathered = [0.0 for _ in range(world_size)] + dist.all_gather_object(gathered, scalar_norm) + + if rank == 0: + baseline = gathered[0] + print(f"Gathered norms: {gathered}") + for idx, other in enumerate(gathered[1:], start=1): + print(f"Rank 0 norm: {baseline}, Rank {idx} norm: {other}, diff: {abs(baseline - other)}") + assert other > 0, f"Rank {idx} returned zero/negative norm" + finally: + dist.destroy_process_group() + + +@pytest.mark.skipif( + fsdp2_strategy.MixedPrecisionPolicy is None, + reason="FSDP2 requires torch>=2.4", +) +@pytest.mark.skipif( + not dist.is_available(), + reason="torch.distributed is not available", +) +@pytest.mark.skipif( + not torch.cuda.is_available(), + reason="CPU-offload grad clip test requires CUDA", +) +def test_fsdp2_cpu_offload_grad_clip_distributed(): + world_size = min(2, torch.cuda.device_count()) + port = _find_free_port() + mp.spawn( + _fsdp2_cpu_offload_grad_clip_worker, + args=(world_size, port), + nprocs=world_size, + join=True, + ) + + +def test_fsdp2_forward_without_remove_padding(strategy_factory): + strategy = strategy_factory( + FSDP2TrainStrategy, use_remove_padding=False + ) + strategy.worker.rank_info.cp_size = 1 + logits = torch.randn(1, 2, 4) + strategy.model = DummyForwardModel(logits=logits) + + input_ids = torch.ones(1, 2, dtype=torch.long) + attention_mask = torch.ones_like(input_ids) + position_ids = torch.zeros_like(input_ids) + + output = strategy._fsdp2_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + forward_args={"foo": torch.tensor(1)}, + ) + + assert torch.equal(output, logits) + assert strategy.model.kwargs["input_ids"] is input_ids + assert strategy.model.kwargs["attention_mask"] is attention_mask + assert strategy.model.kwargs["position_ids"] is position_ids + + +def test_fsdp2_forward_slices_cp_inputs(strategy_factory): + strategy = strategy_factory( + FSDP2TrainStrategy, use_remove_padding=False + ) + strategy.worker.rank_info.cp_size = 2 + 
strategy.worker.rank_info.cp_rank = 1 + logits = torch.randn(1, 2, 4) + strategy.model = DummyForwardModel(logits=logits) + strategy.param_dtype = torch.float32 + + input_ids = torch.arange(0, 4).view(1, 4).long() + attention_mask = torch.ones_like(input_ids) + position_ids = torch.zeros_like(input_ids) + + strategy._fsdp2_forward( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + forward_args={}, + ) + + expected_slice = input_ids[:, 2:] + assert torch.equal(strategy.model.kwargs["input_ids"], expected_slice) + assert torch.equal( + strategy.model.kwargs["attention_mask"], attention_mask[:, 2:] + ) + assert torch.equal( + strategy.model.kwargs["position_ids"], position_ids[:, 2:] + ) + + +def test_forward_step_uses_cp_slice(strategy_factory): + strategy = strategy_factory( + FSDP2InferStrategy, use_remove_padding=False + ) + strategy.worker.rank_info.cp_size = 2 + strategy.worker.rank_info.cp_rank = 1 + logits = torch.zeros(1, 2, 3) + strategy.model = DummyForwardModel(logits=logits) + strategy.param_dtype = torch.float32 + + seq_len = 4 + batch = TensorDict( + { + "input_ids": torch.arange(seq_len).view(1, seq_len), + "attention_mask": torch.ones(1, seq_len, dtype=torch.long), + "position_ids": torch.zeros(1, seq_len, dtype=torch.long), + "response_mask": torch.ones(1, seq_len, dtype=torch.long), + }, + batch_size=[1], + ) + data = DataProto(batch=batch, meta_info={"micro_batch_size": 1}) + + def dummy_forward_func(local_data, output_tensor): + zeros = torch.zeros_like(local_data.batch["input_ids"]).float() + return output_tensor.sum(), {"log_probs": zeros, "entropy": zeros} + + results = strategy.forward_step( + batch=data, + forward_func=dummy_forward_func, + ) + + assert "log_probs" in results and "entropy" in results + expected_slice = torch.arange(seq_len).view(1, seq_len)[:, seq_len // 2 :] + assert torch.equal(strategy.model.kwargs["input_ids"], expected_slice) + + +def test_load_states_moves_model_and_optimizer( + strategy_factory, monkeypatch +): + strategy = strategy_factory(FSDP2StrategyBase) + strategy.model = MockModel() + + captured = {} + + def fake_move(self, device, non_blocking=False): + captured["device"] = device + captured["non_blocking"] = non_blocking + + monkeypatch.setattr( + FSDP2StrategyBase, "_move_optimizer_states", fake_move + ) + + strategy.load_states( + include=[ + OffloadStateType.model_params, + OffloadStateType.optimizer_states, + ], + non_blocking=True, + ) + + assert strategy.model.to_calls == [("cpu", True)] + assert captured["device"] == "cpu" + assert captured["non_blocking"] is True + + +def test_offload_states_moves_to_cpu_and_clears_cuda_cache( + strategy_factory, monkeypatch, platform_stub +): + strategy = strategy_factory(FSDP2StrategyBase) + strategy.model = MockModel() + platform_stub.device_type = "cuda" + + captured = {} + + def fake_move(self, device, non_blocking=False): + captured["device"] = device + captured["non_blocking"] = non_blocking + + monkeypatch.setattr( + FSDP2StrategyBase, "_move_optimizer_states", fake_move + ) + + cache_cleared = {"flag": False} + monkeypatch.setattr( + fsdp2_strategy.torch.cuda, + "empty_cache", + lambda: cache_cleared.__setitem__("flag", True), + ) + + strategy.offload_states( + include=[ + OffloadStateType.model_params, + OffloadStateType.optimizer_states, + ], + non_blocking=True, + ) + + assert strategy.model.cpu_called + assert isinstance(captured["device"], torch.device) + assert captured["device"].type == "cpu" + assert captured["non_blocking"] is True + 
assert cache_cleared["flag"] is True + + +def test_rng_state_roundtrip(monkeypatch): + cpu_state = torch.arange(4, dtype=torch.uint8) + cuda_state = torch.arange(5, dtype=torch.uint8) + numpy_state = ("MT19937", np.arange(624, dtype=np.uint32), 0, 0, 0.0) + random_state = (3, (1, 2, 3), None) + + monkeypatch.setattr(torch, "get_rng_state", lambda: cpu_state.clone()) + monkeypatch.setattr( + torch.cuda, "get_rng_state", lambda: cuda_state.clone() + ) + + captured = {} + monkeypatch.setattr( + torch, + "set_rng_state", + lambda state: captured.__setitem__("cpu", state.clone()), + ) + monkeypatch.setattr( + torch.cuda, + "set_rng_state", + lambda state: captured.__setitem__("cuda", state.clone()), + ) + monkeypatch.setattr(np.random, "get_state", lambda: numpy_state) + monkeypatch.setattr( + np.random, + "set_state", + lambda state: captured.__setitem__("numpy", state), + ) + monkeypatch.setattr(random, "getstate", lambda: random_state) + monkeypatch.setattr( + random, + "setstate", + lambda state: captured.__setitem__("random", state), + ) + + rng_state = FSDP2StrategyBase.get_rng_state() + FSDP2StrategyBase.load_rng_state(rng_state) + + assert torch.equal(rng_state["cpu"], cpu_state) + assert torch.equal(captured["cpu"], cpu_state) + assert torch.equal(rng_state["cuda"], cuda_state) + assert torch.equal(captured["cuda"], cuda_state) + assert rng_state["numpy"] == numpy_state + assert captured["numpy"] == numpy_state + assert rng_state["random"] == random_state + assert captured["random"] == random_state + + +class _TinyMLP(torch.nn.Module): + _no_split_modules = ["Linear"] + + def __init__(self, input_dim=8, hidden_dim=16, output_dim=2): + super().__init__() + self.layers = torch.nn.Sequential( + torch.nn.Linear(input_dim, hidden_dim), + torch.nn.ReLU(), + torch.nn.Linear(hidden_dim, output_dim), + ) + self.config = SimpleNamespace(tie_word_embeddings=False) + self.loss_fn = torch.nn.MSELoss() + + def forward(self, inputs, targets): + return self.loss_fn(self.layers(inputs), targets) + + +def _find_free_port(): + with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as sock: + sock.bind(("", 0)) + return sock.getsockname()[1] + + +def _generate_synthetic_batches(steps, batch_size, input_dim, output_dim): + generator = torch.Generator().manual_seed(2024) + features = torch.randn( + steps, batch_size, input_dim, generator=generator + ) + targets = torch.randn( + steps, batch_size, output_dim, generator=generator + ) + return features, targets + + +def _collect_full_state(model): + state = {} + for name, param in model.named_parameters(): + tensor = param.detach() + if DTensor is not None and isinstance(tensor, DTensor): + if tensor.device.type == "cpu" and torch.cuda.is_available(): + tensor = tensor.to("cuda") + tensor = tensor.full_tensor() + state[name] = tensor.cpu().numpy() + return state + + +def _fsdp2_training_worker(rank, world_size, port, steps): + use_cuda = torch.cuda.is_available() + backend = "nccl" if use_cuda else "gloo" + fsdp2_strategy.current_platform = _PlatformStub( + device_type="cuda" if use_cuda else "cpu", backend=backend + ) + current_platform = _PlatformStub( + device_type="cuda" if use_cuda else "cpu", backend=backend + ) + os.environ["MASTER_ADDR"] = "127.0.0.1" + os.environ["MASTER_PORT"] = str(port) + os.environ["RANK"] = str(rank) + os.environ["WORLD_SIZE"] = str(world_size) + dist.init_process_group( + backend=backend, rank=rank, world_size=world_size + ) + try: + if use_cuda: + torch.cuda.set_device(rank) + device = torch.device("cuda", rank) + else: + 
device = torch.device("cpu") + torch.manual_seed(0) + np.random.seed(0) + random.seed(0) + + model = _TinyMLP() + model.train() + mesh = create_device_mesh_with_ulysses( + world_size=world_size, fsdp_size=world_size + ) + mp_policy = ( + MixedPrecisionPolicy( + param_dtype=torch.float16, + reduce_dtype=torch.float32, + cast_forward_inputs=True, + ) + if use_cuda + else None + ) + offload_policy = ( + CPUOffloadPolicy(pin_memory=True) if use_cuda else False + ) + fsdp_kwargs = { + "mesh": mesh, + "reshard_after_forward": True, + "mp_policy": mp_policy, + "offload_policy": offload_policy, + "shard_placement_fn": get_shard_placement_fn(world_size), + } + strategy_config = { + "fsdp_size": world_size, + } + full_state = model.state_dict() + apply_fsdp2(model, fsdp_kwargs, strategy_config) + fsdp2_load_full_state_dict(model, full_state, mesh, offload_policy) + + optimizer = torch.optim.AdamW(model.parameters(), lr=0.01) + inputs, targets = _generate_synthetic_batches( + steps, batch_size=4, input_dim=8, output_dim=2 + ) + + for step in range(steps): + optimizer.zero_grad() + batch_inputs = inputs[step].to(device) + batch_targets = targets[step].to(device) + loss = model(batch_inputs, batch_targets) + print("Output Device:", loss.device) + print("Target Device:", batch_targets.device) + print("Output Dtype:", loss.dtype) + print("Target Dtype:", batch_targets.dtype) + print("Output Shape:", loss.shape) + print("Target Shape:", batch_targets.shape) + loss.backward() + optimizer.step() + + dist.barrier() + local_state = _collect_full_state(model) + gathered = [None] * world_size if rank == 0 else None + dist.gather_object(local_state, gathered, dst=0) + if rank == 0: + baseline = gathered[0] + for idx, other in enumerate(gathered[1:], start=1): + for key in baseline.keys(): + np.testing.assert_allclose( + baseline[key], + other[key], + atol=1e-6, + err_msg=f"Parameter {key} mismatch between ranks 0 and {idx}", + ) + dist.barrier() + finally: + dist.destroy_process_group() + + +@pytest.mark.skipif( + fsdp2_strategy.MixedPrecisionPolicy is None, + reason="FSDP2 requires torch>=2.4", +) +@pytest.mark.skipif( + not dist.is_available(), + reason="torch.distributed is not available", +) +@pytest.mark.skipif( + not torch.cuda.is_available(), + reason="FSDP2 distributed training sync test requires CUDA", +) +def test_fsdp2_distributed_training_keeps_states_in_sync(): + world_size = 2 + port = _find_free_port() + mp.spawn( + _fsdp2_training_worker, + args=(world_size, port, 3), + nprocs=world_size, + join=True, + ) diff --git a/tests/models/cuda_mem/test_mca_model_forward.py b/tests/models/cuda_mem/test_mca_model_forward.py index 5570b746e..dbd9c95d4 100644 --- a/tests/models/cuda_mem/test_mca_model_forward.py +++ b/tests/models/cuda_mem/test_mca_model_forward.py @@ -130,7 +130,7 @@ def forward_step_func(data_iterator, module): time.sleep(600) """ -RANK=0 WORLD_SIZE=1 MASTER_ADDR='127.0.0.1' MASTER_PORT=54893 python tests/models/cuda_mem/test_turbo_model_forward.py +RANK=0 WORLD_SIZE=1 MASTER_ADDR='127.0.0.1' MASTER_PORT=54893 python tests/models/cuda_mem/test_mca_model_forward.py -torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/models/cuda_mem/test_turbo_model_forward.py +torchrun --standalone --nnodes=1 --nproc-per-node=2 tests/models/cuda_mem/test_mca_model_forward.py """ diff --git a/tests/third_party/megatron/test_offload_states.py b/tests/third_party/megatron/test_offload_states.py index 5044ad396..cb6416ed9 100644 --- a/tests/third_party/megatron/test_offload_states.py +++ 
b/tests/third_party/megatron/test_offload_states.py @@ -34,7 +34,7 @@ from roll.third_party.megatron.optimizer import get_megatron_optimizer -class TurboModelCreator: +class McaModelCreator: def __init__(self, optimizer_type, model_name="/data/cpfs_0/common/models/Qwen2.5-0.5B-Instruct"): self.model_name = model_name @@ -222,7 +222,7 @@ def test_megatron_init_memory(): max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT, ) - mca_model = TurboModelCreator(optimizer_type="dist_optimizer") + mca_model = McaModelCreator(optimizer_type="dist_optimizer") # buffer_data = [] # for buffer in mca_model.optimizer.buffers: @@ -259,7 +259,7 @@ def test_megatron_init_ddp_memory(): max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT, ) - mca_model = TurboModelCreator(optimizer_type=None) + mca_model = McaModelCreator(optimizer_type=None) offload_megatron_no_grad_module(model_chunks=mca_model.model.get_models()) @@ -287,7 +287,7 @@ def check_tensors(expected_tensors: List[torch.Tensor], tensors: List[torch.Tens assert torch.equal(tensor_expected, tensor_restored) -def run_model_infer(mca_model: TurboModelCreator, included_state, pin_memory, non_blocking): +def run_model_infer(mca_model: McaModelCreator, included_state, pin_memory, non_blocking): with torch.no_grad(): for batch in mca_model.data_loader: input_ids, attention_mask = batch @@ -325,7 +325,7 @@ def run_model_infer(mca_model: TurboModelCreator, included_state, pin_memory, no ) -def run_model_dist_optimizer(mca_model: TurboModelCreator, included_state, pin_memory, non_blocking): +def run_model_dist_optimizer(mca_model: McaModelCreator, included_state, pin_memory, non_blocking): assert isinstance(mca_model.optimizer, DistributedOptimizer) for batch in mca_model.data_loader: @@ -530,7 +530,7 @@ def run_model_dist_optimizer(mca_model: TurboModelCreator, included_state, pin_m ) -def run_model_fp16_optimizer(mca_model: TurboModelCreator, included_state, pin_memory, non_blocking): +def run_model_fp16_optimizer(mca_model: McaModelCreator, included_state, pin_memory, non_blocking): assert isinstance(mca_model.optimizer, Float16OptimizerWithFloat16Params) for batch in mca_model.data_loader: @@ -706,7 +706,7 @@ def run_model_fp16_optimizer(mca_model: TurboModelCreator, included_state, pin_m ) -def run_model_fp32_optimizer(mca_model: TurboModelCreator, included_state, pin_memory, non_blocking): +def run_model_fp32_optimizer(mca_model: McaModelCreator, included_state, pin_memory, non_blocking): assert isinstance(mca_model.optimizer, FP32Optimizer) for batch in mca_model.data_loader: @@ -895,7 +895,7 @@ def test_megatron_offload_states(included_state, pin_memory, non_blocking, optim # stacks='python' # ) - mca_model = TurboModelCreator(optimizer_type=optimizer_type) + mca_model = McaModelCreator(optimizer_type=optimizer_type) include = None if included_state is None else [included_state] if optimizer_type is None: diff --git a/tests/third_party/sglang/test_abort.py b/tests/third_party/sglang/test_abort.py new file mode 100644 index 000000000..f1a92d498 --- /dev/null +++ b/tests/third_party/sglang/test_abort.py @@ -0,0 +1,119 @@ +import ray +import asyncio +import uuid + +from sglang.srt.managers.io_struct import GenerateReqInput + +from roll.third_party.sglang import patch as sglang_patch +from roll.utils.checkpoint_manager import download_model + +def chat_format(prompt): + system = "Please reason step by step, and put your final answer within \\boxed{}." 
+ return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" + +prompts = [ + "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案", + "根据关键词描述生成女装/女士精品行业连衣裙品类的发在淘宝的小红书风格的推送配文,包括标题和内容。关键词:pe。要求:1. 推送标题要体现关键词和品类特点,语言通顺,有吸引力,约10个字;2. 推送内容要语言通顺,突出关键词和品类特点,对目标受众有吸引力,长度约30字。标题:", + "100.25和90.75谁更大?", +] + +chat_prompts = [chat_format(prompt) for prompt in prompts] + +async def test_sampling_n(model): + sampling_params = { + 'temperature': 0.8, + 'min_new_tokens': 8192, + 'max_new_tokens': 8192, + 'stream_interval': 50, + 'n': 3, + } + obj = GenerateReqInput( + text=chat_prompts[0], + sampling_params=sampling_params, + rid=None, + stream=True, + ) + chunks: list[dict] = [None for _ in range(sampling_params['n'])] + generator = model.tokenizer_manager.generate_request(obj, None) + async for chunk in generator: + index = chunk.get("index", 0) + chunks[index] = chunk + assert all(chunk is not None for chunk in chunks) + assert all(chunk["meta_info"]["finish_reason"]["type"] == "length" for chunk in chunks) + +async def test_abort_all(model): + sampling_params = { + 'temperature': 0.8, + 'min_new_tokens': 8192, + 'max_new_tokens': 8192, + 'stream_interval': 50, + 'n': 3, + } + obj = GenerateReqInput( + text=chat_prompts[0], + sampling_params=sampling_params, + stream=True, + ) + async def _generate(): + generator = model.tokenizer_manager.generate_request(obj, None) + chunks: list[dict] = [None for _ in range(sampling_params['n'])] + generator = model.tokenizer_manager.generate_request(obj, None) + async for chunk in generator: + index = chunk.get("index", 0) + chunks[index] = chunk + return chunks + task = asyncio.create_task(_generate()) + await asyncio.sleep(1) + for rid in model.tokenizer_manager.rid_to_state: + model.tokenizer_manager.abort_request(rid) + chunks = await task + assert all(chunk is not None for chunk in chunks) # assume at least generate one iter + assert all(chunk["meta_info"]["finish_reason"]["type"] == "abort" for chunk in chunks) + +async def test_abort(model): + sampling_params = { + 'temperature': 0.8, + 'min_new_tokens': 8192, + 'max_new_tokens': 8192, + 'stream_interval': 50, + 'n': 1, + } + rid = uuid.uuid4().hex + obj = GenerateReqInput( + text=chat_prompts[0], + sampling_params=sampling_params, + rid=rid, + stream=True, + ) + async def _generate(): + generator = model.tokenizer_manager.generate_request(obj, None) + chunk = None + async for chunk in generator: + chunk = chunk + return chunk + task = asyncio.create_task(_generate()) + await asyncio.sleep(1) + model.tokenizer_manager.abort_request(rid) + chunk = await task + assert chunk is not None # assume at least generate one iter + assert chunk["meta_info"]["finish_reason"]["type"] == "abort" + +async def main(): + model_path = "Qwen/Qwen2.5-7B-Instruct" + model_path = download_model(model_path) + model = sglang_patch.engine.engine_module.Engine( + enable_memory_saver= True, + model_path=model_path, + dtype="bfloat16", + random_seed=1, + tp_size=1, + mem_fraction_static= 0.6, + disable_custom_all_reduce=True, + ) + + await test_sampling_n(model) + await test_abort_all(model) + await test_abort(model) + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/third_party/sglang/test_fp8.py b/tests/third_party/sglang/test_fp8.py new file mode 100644 index 000000000..c9ec986d2 --- /dev/null +++ b/tests/third_party/sglang/test_fp8.py @@ -0,0 +1,68 @@ +import json +from tqdm import tqdm + +from transformers import AutoModelForCausalLM + +from 
roll.utils.checkpoint_manager import download_model + +if False: + from sglang.srt.entrypoints.engine import Engine +else: + from roll.third_party.sglang import patch as sglang_patch + Engine = sglang_patch.engine.engine_module.Engine + + +def chat_format(prompt): + system = "Please reason step by step, and put your final answer within \\boxed{}." + return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" + +def main(): + model_path = "Qwen/Qwen2.5-0.5B-Instruct" + model_path = "Qwen/Qwen3-Coder-30B-A3B-Instruct-FP8" + model_path = "Qwen/Qwen3-Coder-30B-A3B-Instruct" + model_path = download_model(model_path) + + model = Engine( + model_path=model_path, + skip_tokenizer_init=False, + trust_remote_code=True, + tp_size=1, + load_format="auto", + disable_cuda_graph=False, + disable_custom_all_reduce=True, + sampling_backend="pytorch", + mem_fraction_static=0.6, + max_total_tokens=2048, + max_running_requests=2, + enable_memory_saver=True, + quantization="fp8", + json_model_override_args= + json.dumps({ + "quantization_config": + { + "activation_scheme": "dynamic", + "weight_block_size": [128, 128], + } + }), + ) + + prompts = ["类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案"] + prompts = [chat_format(prompt) for prompt in prompts] + + sampling_params = { + 'min_new_tokens': 128, + 'max_new_tokens': 128, + } + + output = model.generate(prompt=prompts, sampling_params=sampling_params) + print(output) + + train_model = AutoModelForCausalLM.from_pretrained(model_path, dtype="auto") + for name, param in tqdm(iterable=train_model.named_parameters()): + model.update_weights_from_tensor(named_tensors=[(name, param)]) + + output = model.generate(prompt=prompts, sampling_params=sampling_params) + print(output) + +if __name__ == "__main__": + main() diff --git a/tests/third_party/vllm/test_abort.py b/tests/third_party/vllm/test_abort.py new file mode 100644 index 000000000..4848e80bc --- /dev/null +++ b/tests/third_party/vllm/test_abort.py @@ -0,0 +1,147 @@ +import ray +import asyncio +import pytest +from packaging.version import Version + +import vllm +from vllm import SamplingParams +from vllm.sampling_params import RequestOutputKind +from vllm.utils import random_uuid + +from roll.distributed.scheduler.resource_manager import ResourceManager +from roll.third_party.vllm import create_async_llm +from roll.utils.checkpoint_manager import download_model +from utils import chat_prompts, print_request_output + + +# vLLM 0.8.4 has bug when using n_sample with output_kind other than RequestOutputKind.FINAL_ONLY +# https://github.com/vllm-project/vllm/pull/16863 + +async def test_vllm_sampling_n(model): + print(">>>>>>>>>>>>>>> test_vllm_sampling_n") + sampling_params = SamplingParams( + temperature=0.1, + top_p=0.99, + top_k=100, + min_tokens=8192, + max_tokens=8192, + n=3, + output_kind=RequestOutputKind.FINAL_ONLY, + ) + + async def generate(prompt): + request_id = random_uuid() + result_generator = model.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) + output = None + async for request_output in result_generator: + output = request_output + assert output is not None + return output + + output = await generate(chat_prompts[0]) + assert len(output.outputs) == 3 + # print_request_output(output) + +# The semantics of AsyncLLMEngine.abort for v1 and v0 are not aligned (see +# https://github.com/vllm-project/vllm/blob/main/tests/async_engine/test_async_llm_engine.py#L350 and +# 
https://github.com/vllm-project/vllm/blob/main/tests/v1/engine/test_async_llm.py#L185 for difference).
+#
+# What we want is the v0 semantics: raise asyncio.CancelledError in AsyncLLMEngine.generate
+# when the request is aborted, rather than relying on cancellation of the async task to abort the request.
+async def test_vllm_abort(model):
+    print(">>>>>>>>>>>>>>> test_vllm_abort")
+    sampling_params = SamplingParams(
+        temperature=0.1,
+        min_tokens=8192,
+        max_tokens=8192,
+        n=3,
+        output_kind=RequestOutputKind.FINAL_ONLY,
+    )
+
+    request_id = random_uuid()
+    async def generate():
+        output = None
+        if Version(vllm.__version__) >= Version("0.10.2"):
+            async for request_output in model.generate(chat_prompts[0], sampling_params, request_id=request_id):
+                output = request_output
+        else:
+            with pytest.raises(asyncio.CancelledError):  # we patch older vllm versions
+                async for request_output in model.generate(chat_prompts[0], sampling_params, request_id=request_id):
+                    output = request_output
+        return output
+
+    task = asyncio.create_task(generate())
+    await asyncio.sleep(1)
+    await model.abort(request_id)
+    output = await task
+    # assume generation takes longer than 1s
+    if Version(vllm.__version__) >= Version("0.10.2"):
+        assert output is not None and output.finished
+        assert len(output.outputs) == 3
+        assert all(out.finish_reason == "abort" for out in output.outputs)
+    else:
+        assert output is None
+
+async def test_vllm_abort_cumulative(model):
+    print(">>>>>>>>>>>>>>> test_vllm_abort_cumulative")
+    sampling_params = SamplingParams(
+        temperature=0.1,
+        min_tokens=8192,
+        max_tokens=8192,
+        n=3,  # before 0.10.2, n-sample behaviour matches sglang: outputs must be stored by index
+        output_kind=RequestOutputKind.CUMULATIVE,
+    )
+
+    request_id = random_uuid()
+    async def generate():
+        output = None
+        if Version(vllm.__version__) >= Version("0.10.2"):
+            async for request_output in model.generate(chat_prompts[0], sampling_params, request_id=request_id):
+                output = request_output
+        else:
+            with pytest.raises(asyncio.CancelledError):  # we patch older vllm versions
+                async for request_output in model.generate(chat_prompts[0], sampling_params, request_id=request_id):
+                    output = request_output
+        return output
+
+    task = asyncio.create_task(generate())
+    await asyncio.sleep(1)
+    await model.abort(request_id)
+    output = await task
+    # assume at least one chunk was generated and generation takes longer than 1s
+    if Version(vllm.__version__) >= Version("0.10.2"):
+        assert output is not None and output.finished
+        assert len(output.outputs) == 3
+        assert all(out.finish_reason == "abort" for out in output.outputs)
+    else:
+        assert output is not None and not output.finished
+        assert len(output.outputs) == 1  # does not match sampling_params.n
+
+async def main():
+    model_path = "Qwen/Qwen2.5-7B-Instruct"
+    model_path = download_model(model_path)
+
+    ray.init()
+    resource_manager = ResourceManager(2, 1)
+    placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=[0, 1])
+    sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=512)
+
+    model = await create_async_llm(
+        resource_placement_groups=placement_groups[0],
+        model=model_path,
+        block_size=16,
+        dtype="bfloat16",
+        gpu_memory_utilization=0.8,
+        tensor_parallel_size=2,
+        distributed_executor_backend="ray",
+        disable_custom_all_reduce=True,
+        enable_sleep_mode=True,
+        enforce_eager=False,
+    )
+
+    await test_vllm_sampling_n(model)
+    await test_vllm_abort(model)
+    await test_vllm_abort_cumulative(model)
+
+if __name__ == 
"__main__": + asyncio.run(main()) diff --git a/tests/third_party/vllm/test_collective_rpc.py b/tests/third_party/vllm/test_collective_rpc.py new file mode 100644 index 000000000..4ee1b2a32 --- /dev/null +++ b/tests/third_party/vllm/test_collective_rpc.py @@ -0,0 +1,83 @@ +import ray +import asyncio +import torch +import pytest +from tqdm import tqdm +from transformers import AutoModelForCausalLM + +from roll.distributed.scheduler.resource_manager import ResourceManager +from roll.third_party.vllm import create_async_llm +from roll.third_party.vllm.worker_helper import WorkerHelper +from roll.utils.checkpoint_manager import download_model + + +def load_weight_tensor(self, name, param): + self.load_weights([(name, param)]) +WorkerHelper.load_weight_tensor = load_weight_tensor + +def load_weight_numpy(self, name, param): + param = torch.from_numpy(param) + self.load_weights([(name, param)]) +WorkerHelper.load_weight_numpy = load_weight_numpy + +def load_weight_list(self, name, dtype, buffer): + weight = torch.tensor(buffer, dtype=dtype).cuda() + self.load_weights([(name, weight)]) +WorkerHelper.load_weight_list = load_weight_list + +async def test_vllm_collective_rpc(): + ray.init() + resource_manager = ResourceManager(1, 1) + placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=[0]) + + model_path = "Qwen/Qwen2.5-7B-Instruct" + model_path = download_model(model_path) + model = await create_async_llm( + resource_placement_groups=placement_groups[0], + model=model_path, + load_format="auto", + block_size=16, + dtype="bfloat16", + gpu_memory_utilization=0.8, + tensor_parallel_size=1, + disable_custom_all_reduce=True, + enable_sleep_mode=True, + enforce_eager=False, + ) + + train_model = AutoModelForCausalLM.from_pretrained(model_path) + + print(">>>>>>>>>>>>>>> test_vllm_rpc: tensor(cuda)") + with pytest.raises(Exception): + try: + for name, param in tqdm(list(train_model.named_parameters()), desc="Updating parameter", unit="param"): + await model.engine_core.collective_rpc_async(method="load_weight_tensor", args=(name, param.detach().cuda())) + except Exception as e: + print("<<<<<<<<<<<<<<< exception: ", e) + raise + + print(">>>>>>>>>>>>>>> test_vllm_rpc: tensor(cpu)") + with pytest.raises(Exception): + try: + for name, param in tqdm(list(train_model.named_parameters()), desc="Updating parameter", unit="param"): + await model.engine_core.collective_rpc_async(method="load_weight_tensor", args=(name, param.detach().cpu())) + except Exception as e: + print("<<<<<<<<<<<<<<< exception: ", e) + raise + + print(">>>>>>>>>>>>>>> test_vllm_rpc: numpy") + with pytest.raises(Exception): + try: + for name, param in tqdm(list(train_model.named_parameters()), desc="Updating parameter", unit="param"): + await model.engine_core.collective_rpc_async(method="load_weight_numpy", args=(name, param.detach().numpy())) + except Exception as e: + print("<<<<<<<<<<<<<<< exception: ", e) + raise + + print(">>>>>>>>>>>>>>> test_vllm_rpc: list") + for name, p in tqdm(list(train_model.named_parameters()), desc="Updating parameter", unit="param"): + await model.engine_core.collective_rpc_async(method="load_weight_list", args=(name, p.dtype, p.tolist())) + +if __name__ == "__main__": + loop = asyncio.get_event_loop() + loop.run_until_complete(test_vllm_collective_rpc()) diff --git a/tests/third_party/vllm/test_fp8.py b/tests/third_party/vllm/test_fp8.py index de1bf9554..6b5383115 100644 --- a/tests/third_party/vllm/test_fp8.py +++ b/tests/third_party/vllm/test_fp8.py @@ -1,47 +1,45 @@ 
-import gc import os -import uuid -from contextlib import contextmanager - +import asyncio import ray -import torch from tqdm import tqdm from roll.platforms import current_platform from transformers import AutoModelForCausalLM from vllm import SamplingParams -from vllm.utils import GiB_bytes from roll.distributed.scheduler.resource_manager import ResourceManager -from roll.third_party.vllm import LLM -from roll.third_party.vllm.worker_helper import WorkerHelper +from roll.third_party.vllm import create_async_llm +from roll.third_party.vllm.worker import WorkerV1 from roll.utils.checkpoint_manager import download_model +from utils import generate_batch, chat_format, print_current_mem_usage, mem_usage, print_request_output -USE_CUSTOME_MODEL_UPDATE = True -def print_current_mem_usage(tag): - current_platform.empty_cache() - gc.collect() - free_bytes, total = current_platform.mem_get_info() - print(f"[mem_usage] {tag} | current used: {(total - free_bytes) / GiB_bytes}") +class Fp8Worker(WorkerV1): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) -def custom_wakeup(self): - print_current_mem_usage("before_wakeup") + def custom_wakeup(self): + print_current_mem_usage("before_wakeup") + self.wake_up(["weights"]) + print_current_mem_usage("after_wakeup") - self.wake_up(["weights"]) - print_current_mem_usage("after_wakeup") + def custom_load_model(self, model_path, zero=False): + train_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") + for param_name, param in tqdm(iterable=train_model.named_parameters(), total=len(list(train_model.named_parameters()))): + if zero: + param = param.data.clone().cuda().zero_() + else: + param = param.data.clone().cuda() + self.load_weights([(param_name, param)]) -WorkerHelper.custom_wakeup = custom_wakeup +async def test_fp8_mem_usage(): + os.environ["VLLM_USE_V1"] = "1" -def test_fp8_mem(): - ray.init() - resource_manager = ResourceManager(1, 1) - placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=[0]) model_path = "Qwen/Qwen2.5-7B-Instruct" model_path = download_model(model_path) - model = LLM( - resource_placement_groups=placement_groups[0], + model = await create_async_llm( + resource_placement_groups=[[0]], model=model_path, load_format="auto", block_size=16, @@ -51,52 +49,12 @@ def test_fp8_mem(): enable_sleep_mode=True, enforce_eager=False, quantization="fp8", + worker_extension_cls="tests.third_party.vllm.test_fp8.Fp8Worker" ) - model.collective_rpc(method="offload_states", args=(1,)) - model.collective_rpc(method="custom_wakeup") - - -@contextmanager -def mem_usage(mem_profile=False): - free_bytes, total = current_platform.mem_get_info() - used_bytes_before = total - free_bytes - MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000 - if mem_profile: - torch.cuda.memory._record_memory_history(max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT, stacks="python") - try: - yield - finally: - torch.cuda.empty_cache() - gc.collect() - dump_file = "" - if mem_profile: - dump_file = f"/tmp/{uuid.uuid4()}.pickle" - os.makedirs(os.path.dirname(dump_file), exist_ok=True) - torch.cuda.memory._dump_snapshot(dump_file) - # print(f"{torch.cuda.memory._snapshot()}") - torch.cuda.memory._record_memory_history(enabled=None) - free_bytes, total = current_platform.mem_get_info() - used_bytes_after = total - free_bytes - print( - f"[mem_usage] before {used_bytes_before / GiB_bytes} after {used_bytes_after / GiB_bytes}, dump to file {dump_file}" - ) - -def custom_load_model(self, model_path, 
zero=False): - train_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") - for param_name, param in tqdm(iterable=train_model.named_parameters(), total=len(list(train_model.named_parameters()))): - if zero: - param = param.data.clone().cuda().zero_() - else: - param = param.data.clone().cuda() - self.load_weights([(param_name, param)]) - -WorkerHelper.custom_load_model = custom_load_model - -def chat_format(prompt): - system = "Please reason step by step, and put your final answer within \\boxed{}." - return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" - -def test_fp8(): + await model.offload_states(level=1) + await model.engine_core.collective_rpc_async("custom_wakeup") + +async def test_fp8(): os.environ["VLLM_USE_DEEP_GEMM"] = "1" ray.init() @@ -107,7 +65,7 @@ def test_fp8(): model_path = "Qwen/Qwen3-30B-A3B-Instruct-2507" model_path = "Qwen/Qwen3-32B" model_path = download_model(model_path) - model = LLM( + model = await create_async_llm( resource_placement_groups=placement_groups[0], model=model_path, load_format="auto", @@ -115,37 +73,40 @@ def test_fp8(): dtype="bfloat16", gpu_memory_utilization=0.8, tensor_parallel_size=2, + disable_custom_all_reduce=True, enable_sleep_mode=True, enforce_eager=False, quantization="fp8", + worker_extension_cls="tests.third_party.vllm.worker.Fp8Worker" ) - prompts = [ - "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案", - ] - chat_prompts = [] - for prompt in prompts: - chat_prompts.append(chat_format(prompt)) + prompts = ["类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案"] + chat_prompts = [chat_format(prompt) for prompt in prompts] + sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=512) - vllm_outputs = model.generate(prompts=chat_prompts, sampling_params=sampling_params) - print(vllm_outputs) + vllm_outputs = await generate_batch(model, chat_prompts, sampling_params) + print_request_output(vllm_outputs) - model.offload_states() - model.collective_rpc("custom_load_model", args=(model_path, True)) + await model.offload_states() + await model.engine_core.collective_rpc_async("custom_load_model", args=(model_path, True)) with mem_usage(): - model.load_states() + await model.load_states() - vllm_outputs = model.generate(prompts=chat_prompts, sampling_params=sampling_params) - print(vllm_outputs) + vllm_outputs = await generate_batch(model, chat_prompts, sampling_params) + print_request_output(vllm_outputs) - model.offload_states() - model.collective_rpc("custom_load_model", args=(model_path, False)) + await model.offload_states() + await model.engine_core.collective_rpc_async("custom_load_model", args=(model_path, False)) with mem_usage(): - model.load_states() + await model.load_states() + + vllm_outputs = await generate_batch(model, chat_prompts, sampling_params) + print_request_output(vllm_outputs) - vllm_outputs = model.generate(prompts=chat_prompts, sampling_params=sampling_params) - print(vllm_outputs) +async def main(): + await test_fp8_mem_usage() + await test_fp8() if __name__ == "__main__": - test_fp8() + asyncio.run(main()) diff --git a/tests/third_party/vllm/test_fp8_perf.py b/tests/third_party/vllm/test_fp8_perf.py index 02b6c107f..ac095b5da 100644 --- a/tests/third_party/vllm/test_fp8_perf.py +++ b/tests/third_party/vllm/test_fp8_perf.py @@ -3,18 +3,15 @@ import time import itertools +import asyncio import ray from vllm import SamplingParams from roll.distributed.scheduler.resource_manager import ResourceManager 
-from roll.third_party.vllm import LLM +from roll.third_party.vllm import create_async_llm from roll.utils.checkpoint_manager import download_model -import nvtx +from utils import generate_batch, chat_format -def chat_format(prompt): - system = "Please reason step by step, and put your final answer within \\boxed{}." - return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" - def print_speed_metrics(outputs, start_time): now = time.time() print(f"total time cost: {now - start_time}s") @@ -29,10 +26,10 @@ def print_speed_metrics(outputs, start_time): print(f"mean prompt len: {sum([len(o.prompt_token_ids) for o in outputs]) / len(outputs)}") print(f"min prompt len: {min([len(o.prompt_token_ids) for o in outputs])}") -def generate(model, prompts, sampling_params): +async def generate(model, prompts, sampling_params): print(f"Begin generate for {len(prompts)} prompts") start_time = time.time() - outputs = model.generate(prompts, sampling_params) + outputs = await generate_batch(model, prompts, sampling_params) print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<") print_speed_metrics(outputs, start_time) print("<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<") @@ -70,19 +67,21 @@ def get_sampling_param_max(limit, num): sampling_params.append(sampling_param) return sampling_params, num_tokens -def test_uniform(model, chat_prompts, limit, num): +async def test_uniform(model, chat_prompts, limit, num): print(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TEST UNIFORM {limit} {num}") sampling_params, num_tokens = get_sampling_param_uniform(limit, num) prompts = list(itertools.islice(itertools.cycle(chat_prompts), len(sampling_params))) - generate(model, prompts, sampling_params) + await generate(model, prompts, sampling_params) + await model.do_log_stats() -def test_max(model, chat_prompts, limit, num): +async def test_max(model, chat_prompts, limit, num): print(f">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> TEST MAX {limit} {num}") sampling_params, num_tokens = get_sampling_param_max(limit, num) prompts = list(itertools.islice(itertools.cycle(chat_prompts), len(sampling_params))) - generate(model, prompts, sampling_params) + await generate(model, prompts, sampling_params) + await model.do_log_stats() -if __name__ == "__main__": +async def main(): os.environ["VLLM_USE_DEEP_GEMM"] = "1" os.environ["NCCL_NVLS_ENABLE"] = "0" @@ -98,7 +97,7 @@ def test_max(model, chat_prompts, limit, num): model_path = "/data/cpfs_0/common/models/Qwen3-32B" model_path = "/data/cpfs_0/common/models/Qwen3-30B-A3B" model_path = download_model(model_path) - model = LLM( + model = await create_async_llm( resource_placement_groups=placement_groups[0], model=model_path, tensor_parallel_size=2, @@ -112,7 +111,7 @@ def test_max(model, chat_prompts, limit, num): # "activation_scheme": "dynamic", # "fmt": "e4m3", # "quant_method": "fp8", - # "weight_block_size": [64, 64], + # "weight_block_size": [128, 128], # } # }, ) @@ -130,16 +129,19 @@ def test_max(model, chat_prompts, limit, num): # nsys profile --trace-fork-before-exec=true --cuda-graph-trace=node #with nvtx.annotate("generate"): - # test_max(model, chat_prompts, 4096, 32) - - test_max(model, chat_prompts, 4096, 32) - test_max(model, chat_prompts, 4096, 16) - test_max(model, chat_prompts, 4096, 8) - test_max(model, chat_prompts, 4096, 4) - test_max(model, chat_prompts, 4096, 1) - - test_uniform(model, chat_prompts, 4096, 32) - test_uniform(model, chat_prompts, 4096, 16) - test_uniform(model, chat_prompts, 4096, 8) - 
test_uniform(model, chat_prompts, 4096, 4) - test_uniform(model, chat_prompts, 4096, 1) + # await test_max(model, chat_prompts, 4096, 32) + + await test_max(model, chat_prompts, 4096, 32) + await test_max(model, chat_prompts, 4096, 16) + await test_max(model, chat_prompts, 4096, 8) + await test_max(model, chat_prompts, 4096, 4) + await test_max(model, chat_prompts, 4096, 1) + + await test_uniform(model, chat_prompts, 4096, 32) + await test_uniform(model, chat_prompts, 4096, 16) + await test_uniform(model, chat_prompts, 4096, 8) + await test_uniform(model, chat_prompts, 4096, 4) + await test_uniform(model, chat_prompts, 4096, 1) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/tests/third_party/vllm/test_model_update.py b/tests/third_party/vllm/test_model_update.py index 3358c87b6..cbce8c9d1 100644 --- a/tests/third_party/vllm/test_model_update.py +++ b/tests/third_party/vllm/test_model_update.py @@ -1,93 +1,77 @@ -import pytest +import os import ray -import torch +import asyncio from tqdm import tqdm from transformers import AutoModelForCausalLM from vllm import SamplingParams from roll.distributed.scheduler.resource_manager import ResourceManager -from roll.third_party.vllm import LLM -from roll.third_party.vllm.worker_helper import WorkerHelper +from roll.third_party.vllm import create_async_llm +from roll.third_party.vllm.worker import WorkerV1 from roll.utils.checkpoint_manager import download_model - - -def load_weight_tensor(self, name, param): - self.load_weights([(name, param)]) - -WorkerHelper.load_weight_tensor = load_weight_tensor - -def load_weights_tensor(self, model): - for name, param in tqdm(list(model.named_parameters()), desc="Updating parameter", unit="param"): - self.collective_rpc(method="load_weight_tensor", args=(name, param.detach().cuda())) - -LLM.load_weights_tensor = load_weights_tensor - - -def load_weight_numpy(self, name, param): - param = torch.from_numpy(param) - self.load_weights([(name, param)]) - -WorkerHelper.load_weight_numpy = load_weight_numpy - -def load_weights_numpy(self, model): - for name, param in tqdm(list(model.named_parameters()), desc="Updating parameter", unit="param"): - self.collective_rpc(method="load_weight_numpy", args=(name, param.detach().numpy())) - -LLM.load_weights_numpy = load_weights_numpy - - -def load_weight_list(self, name, dtype, buffer): - weight = torch.tensor(buffer, dtype=dtype).cuda() - self.load_weights([(name, weight)]) - -WorkerHelper.load_weight_list = load_weight_list - -def load_weights_list(self, model): - for name, p in tqdm(list(model.named_parameters()), desc="Updating parameter", unit="param"): - self.collective_rpc(method="load_weight_list", args=(name, p.dtype, p.tolist())) - -LLM.load_weights_list = load_weights_list - - -def test_model_update_single_gpu(): - model_path = "Qwen/Qwen2.5-0.5B-Instruct" - model_path = download_model(model_path) - +from utils import generate_batch, chat_prompts, print_request_output + +class ModelUpdateWorker(WorkerV1): + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def load_full_model(self, model_path, zero=False): + train_model = AutoModelForCausalLM.from_pretrained(model_path, torch_dtype="auto") + for param_name, param in tqdm(iterable=train_model.named_parameters(), total=len(list(train_model.named_parameters()))): + if zero: + param = param.data.clone().cuda().zero_() + else: + param = param.data.clone().cuda() + self.load_weights([(param_name, param)]) + +async def test_vllm_offload(): + 
os.environ["VLLM_USE_V1"] = "1" ray.init() - resource_manager = ResourceManager(1, 1) - placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=[0]) + resource_manager = ResourceManager(2, 1) + placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=[0,1]) - model = LLM( + model_path = "Qwen/Qwen2.5-7B-Instruct" + model_path = download_model(model_path) + model = await create_async_llm( resource_placement_groups=placement_groups[0], model=model_path, + load_format="auto", block_size=16, dtype="bfloat16", gpu_memory_utilization=0.8, - tensor_parallel_size=1, - trust_remote_code=True, + tensor_parallel_size=2, disable_custom_all_reduce=True, - enforce_eager=True, enable_sleep_mode=True, + enforce_eager=False, + worker_extension_cls="tests.third_party.vllm.test_model_update.ModelUpdateWorker", ) - train_model = AutoModelForCausalLM.from_pretrained(model_path) - - with pytest.raises(Exception): - try: - model.load_weights_tensor(train_model) - except Exception as e: - print("load_weights_tensor exception: ", e) - raise - - with pytest.raises(Exception): - try: - model.load_weights_numpy(train_model) - except Exception as e: - print("load_weights_numpy exception: ", e) - raise - - model.load_weights_list(train_model) - + # test offload/onload and sleep_level + sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=512) + + print(">>>>>>>>>>>>>>> test_vllm_load_offload: base") + vllm_outputs = await generate_batch(model=model, prompts=chat_prompts, sampling_params=sampling_params) + assert len(vllm_outputs) == len(chat_prompts) + print_request_output(vllm_outputs) + + print(">>>>>>>>>>>>>>> test_vllm_load_offload: offload states sleep_level_1") + await model.offload_states(1) + await model.load_states() + vllm_outputs = await generate_batch(model=model, prompts=chat_prompts, sampling_params=sampling_params) + print_request_output(vllm_outputs) + + print(">>>>>>>>>>>>>>> test_vllm_load_offload: offload states sleep_level_2") + await model.offload_states(2) + await model.load_states() + vllm_outputs = await generate_batch(model=model, prompts=chat_prompts, sampling_params=sampling_params) + print_request_output(vllm_outputs) + + print(">>>>>>>>>>>>>>> test_vllm_load_offload: offload states sleep_level_2 + reload") + await model.offload_states(2) + await model.engine_core.collective_rpc_async("load_full_model", args=(model_path,)) + await model.load_states() + vllm_outputs = await generate_batch(model=model, prompts=chat_prompts, sampling_params=sampling_params) + print_request_output(vllm_outputs) if __name__ == "__main__": - test_model_update_single_gpu() + asyncio.run(test_vllm_offload()) diff --git a/tests/third_party/vllm/test_vllm_local_actor.py b/tests/third_party/vllm/test_vllm_local_actor.py index a8ca661f8..010da06d0 100644 --- a/tests/third_party/vllm/test_vllm_local_actor.py +++ b/tests/third_party/vllm/test_vllm_local_actor.py @@ -9,29 +9,14 @@ from vllm import SamplingParams from roll.distributed.scheduler.resource_manager import ResourceManager -from roll.third_party.vllm import LLM +from roll.third_party.vllm import create_async_llm +from utils import chat_prompts, generate_batch model_path = "Qwen/Qwen2.5-7B-Instruct" -prompts = [ - "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案", - "根据关键词描述生成女装/女士精品行业连衣裙品类的发在淘宝的小红书风格的推送配文,包括标题和内容。关键词:pe。要求:1. 推送标题要体现关键词和品类特点,语言通顺,有吸引力,约10个字;2. 
推送内容要语言通顺,突出关键词和品类特点,对目标受众有吸引力,长度约30字。标题:", - "100.25和90.75谁更大?", -] - - -def chat_format(prompt): - system = "Please reason step by step, and put your final answer within \\boxed{}." - return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" - - tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) -chat_prompts = [] -for prompt in prompts: - chat_prompts.append(chat_format(prompt)) - # os.environ["RAY_DEBUG"] = "legacy" # breakpoint() @@ -51,8 +36,8 @@ def chat_format(prompt): @ray.remote class TestActor: - def __init__(self, placement_groups): - self.model = LLM( + async def initialize(self, placement_groups): + self.model = await create_async_llm( resource_placement_groups=placement_groups[0], model=model_path, block_size=16, @@ -65,9 +50,9 @@ def __init__(self, placement_groups): enable_sleep_mode=True, ) - def run(self): + async def run(self): sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=512) - self.model.offload_states() + await self.model.offload_states() import torch print(f"memory allocated: {torch.cuda.memory_allocated() / 1024 ** 3}") @@ -78,9 +63,10 @@ def run(self): pdb.set_trace() - self.model.load_states() + await self.model.load_states() - vllm_outputs = self.model.generate( + vllm_outputs = await generate_batch( + self.model, sampling_params=sampling_params, prompts=chat_prompts, ) @@ -109,7 +95,8 @@ def run(self): runtime_env=runtime_env, num_cpus=0.01, num_gpus=0.01, -).remote(placement_groups=placement_groups) +).remote() +ray.get(actor.initialize.remote(placement_groups=placement_groups)) ray.get(actor.run.remote()) ray.shutdown() diff --git a/tests/third_party/vllm/test_vllm_mem_oom.py b/tests/third_party/vllm/test_vllm_mem_oom.py index 0e1aa9546..3eaa28c59 100644 --- a/tests/third_party/vllm/test_vllm_mem_oom.py +++ b/tests/third_party/vllm/test_vllm_mem_oom.py @@ -1,101 +1,88 @@ -import os -import time - +import asyncio import ray from transformers import AutoTokenizer from vllm import SamplingParams from roll.distributed.scheduler.resource_manager import ResourceManager -from roll.third_party.vllm import LLM +from roll.third_party.vllm import create_async_llm from roll.utils.context_managers import cpu_memory_info from roll.utils.logging import get_logger +from utils import generate_batch, chat_prompts logger = get_logger() - -model_path = "Qwen/Qwen2.5-7B-Instruct" - -prompts = [ - "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案", - "根据关键词描述生成女装/女士精品行业连衣裙品类的发在淘宝的小红书风格的推送配文,包括标题和内容。关键词:pe。要求:1. 推送标题要体现关键词和品类特点,语言通顺,有吸引力,约10个字;2. 推送内容要语言通顺,突出关键词和品类特点,对目标受众有吸引力,长度约30字。标题:", - "100.25和90.75谁更大?", -] - - -def chat_format(prompt): - system = "Please reason step by step, and put your final answer within \\boxed{}." 
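# A minimal sketch of the two-step actor construction used in the updated local-actor
# test above: a Ray actor's __init__ cannot await, so the `create_async_llm` call moves
# into an async `initialize` method that the driver runs to completion with `ray.get`.
# `DemoActor` is a placeholder name for illustration.
import ray
from roll.distributed.scheduler.resource_manager import ResourceManager
from roll.third_party.vllm import create_async_llm

@ray.remote
class DemoActor:
    async def initialize(self, placement_groups):
        # the async construction that previously lived in __init__
        self.model = await create_async_llm(
            resource_placement_groups=placement_groups[0],
            model="Qwen/Qwen2.5-7B-Instruct",
            dtype="bfloat16",
            tensor_parallel_size=1,
            enable_sleep_mode=True,
        )

    async def run(self):
        await self.model.offload_states()
        await self.model.load_states()

ray.init()
placement_groups = ResourceManager().allocate_placement_group(world_size=1, device_mapping=[0])
actor = DemoActor.remote()
ray.get(actor.initialize.remote(placement_groups=placement_groups))  # construct first,
ray.get(actor.run.remote())                                          # then use the actor
ray.shutdown()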
- return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" - - -tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) - -chat_prompts = [] -for prompt in prompts: - chat_prompts.append(chat_format(prompt)) - -# os.environ["RAY_DEBUG"] = "legacy" - -ray.init() -resource_manager = ResourceManager() -placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=list(range(1))) -sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=1024) - -model = LLM( - resource_placement_groups=placement_groups[0], - model=model_path, - block_size=16, - dtype="bfloat16", - gpu_memory_utilization=0.8, - tensor_parallel_size=1, - trust_remote_code=True, - load_format="dummy", -) - - -from memory_profiler import profile -import tracemalloc - -# tracemalloc.start() - -snapshot_1 = None -snapshot_last = None - - -# @profile -def generate_memory(): - global snapshot_1, snapshot_last - for _ in range(20): - model.load_states() - model.generate( - sampling_params=sampling_params, - prompts=chat_prompts, - use_tqdm=False, - ) - model.offload_states() - rss = cpu_memory_info().rss / 1024**2 - logger.info(f"rss: {rss}") - # snapshot_last = tracemalloc.take_snapshot() - # if snapshot_1 is None: - # snapshot_1 = snapshot_last - - -generate_memory() - -# tracemalloc.stop() - -# snapshot.dump(f"mem_dump.pickle") -ray.shutdown() - -# https://www.datacamp.com/tutorial/memory-profiling-python -# -# stats_1 = snapshot_1.compare_to(snapshot_last, 'lineno') -# -# with open('memory_leak_analysis.txt', 'w') as f: -# f.write("[ Memory usage increase from snapshot 1 to snapshot 2 ]\n") -# for stat in stats_1[:10]: -# f.write(f"{stat}\n") -# -# # Detailed traceback for the top memory consumers -# f.write("\n[ Detailed traceback for the top memory consumers ]\n") -# for stat in stats_1[:-1]: -# f.write('\n'.join(stat.traceback.format()) + '\n\n\n') +async def main(): + model_path = "Qwen/Qwen2.5-7B-Instruct" + + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + + # os.environ["RAY_DEBUG"] = "legacy" + + ray.init() + resource_manager = ResourceManager() + placement_groups = resource_manager.allocate_placement_group(world_size=1, device_mapping=list(range(1))) + sampling_params = SamplingParams(temperature=0.0, top_p=0.99, top_k=100, max_tokens=1024) + + model = await create_async_llm( + resource_placement_groups=placement_groups[0], + model=model_path, + block_size=16, + dtype="bfloat16", + gpu_memory_utilization=0.8, + tensor_parallel_size=1, + trust_remote_code=True, + load_format="dummy", + ) + + + from memory_profiler import profile + import tracemalloc + + # tracemalloc.start() + + snapshot_1 = None + snapshot_last = None + + + # @profile + async def generate_memory(): + global snapshot_1, snapshot_last + for _ in range(20): + await model.load_states() + await generate_batch( + model, + sampling_params=sampling_params, + prompts=chat_prompts, + use_tqdm=False, + ) + model.offload_states() + rss = cpu_memory_info().rss / 1024**2 + logger.info(f"rss: {rss}") + # snapshot_last = tracemalloc.take_snapshot() + # if snapshot_1 is None: + # snapshot_1 = snapshot_last + + + await generate_memory() + + # tracemalloc.stop() + + # snapshot.dump(f"mem_dump.pickle") + ray.shutdown() + + # https://www.datacamp.com/tutorial/memory-profiling-python + # + # stats_1 = snapshot_1.compare_to(snapshot_last, 'lineno') + # + # with open('memory_leak_analysis.txt', 'w') as f: + # 
f.write("[ Memory usage increase from snapshot 1 to snapshot 2 ]\n") + # for stat in stats_1[:10]: + # f.write(f"{stat}\n") + # + # # Detailed traceback for the top memory consumers + # f.write("\n[ Detailed traceback for the top memory consumers ]\n") + # for stat in stats_1[:-1]: + # f.write('\n'.join(stat.traceback.format()) + '\n\n\n') + +if __name__ == "__main__": + asyncio.run(main()) diff --git a/tests/third_party/vllm/utils.py b/tests/third_party/vllm/utils.py new file mode 100644 index 000000000..53c9e49a5 --- /dev/null +++ b/tests/third_party/vllm/utils.py @@ -0,0 +1,86 @@ +import asyncio +import os +import gc +import torch +from contextlib import contextmanager + +from vllm.sampling_params import SamplingParams +from vllm.utils import random_uuid, GiB_bytes +from vllm.outputs import RequestOutput + +from roll.platforms import current_platform + + +# helper function to generate batch of requests with the same sampling_params +async def generate_batch(model, prompts, sampling_params): + assert isinstance(sampling_params, SamplingParams) + async def generate(prompt): + request_id = random_uuid() + result_generator = model.generate(prompt=prompt, sampling_params=sampling_params, request_id=request_id) + output = None + async for request_output in result_generator: + output = request_output + assert output is not None + return output + return await asyncio.gather(*[generate(prompt) for prompt in prompts]) + +def print_request_output(vllm_output: RequestOutput): + def _print(output): + print(f"[request] {output.request_id}") + print(f"[prompt] {repr(output.prompt)}") + for text in output.outputs: + print(f"[text] {repr(text.text)}") + + if vllm_output is None: + print(f"[output is None]") + elif isinstance(vllm_output, list): + for output in vllm_output: + _print(output) + else: + assert isinstance(vllm_output, RequestOutput) + _print(vllm_output) + + +def chat_format(prompt): + system = "Please reason step by step, and put your final answer within \\boxed{}." + return f"<|im_start|>system\n{system}<|im_end|>\n<|im_start|>user\n{prompt}<|im_end|>\n<|im_start|>assistant\n" + +prompts = [ + "类型#上衣*材质#牛仔布*颜色#白色*风格#简约*图案#刺绣*衣样式#外套*衣款式#破洞,生成一段文案", + "根据关键词描述生成女装/女士精品行业连衣裙品类的发在淘宝的小红书风格的推送配文,包括标题和内容。关键词:pe。要求:1. 推送标题要体现关键词和品类特点,语言通顺,有吸引力,约10个字;2. 
推送内容要语言通顺,突出关键词和品类特点,对目标受众有吸引力,长度约30字。标题:", + "100.25和90.75谁更大?", +] + +chat_prompts = [chat_format(prompt) for prompt in prompts] + + +def print_current_mem_usage(tag): + current_platform.empty_cache() + gc.collect() + free_bytes, total = current_platform.mem_get_info() + print(f"[mem_usage] {tag} | current used: {(total - free_bytes) / GiB_bytes}") + +@contextmanager +def mem_usage(mem_profile=False): + free_bytes, total = torch.cuda.mem_get_info() + used_bytes_before = total - free_bytes + MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT: int = 100000 + if mem_profile: + torch.cuda.memory._record_memory_history(max_entries=MAX_NUM_OF_MEM_EVENTS_PER_SNAPSHOT, stacks="python") + try: + yield + finally: + torch.cuda.empty_cache() + gc.collect() + dump_file = "" + if mem_profile: + dump_file = f"/tmp/{random_uuid()}.pickle" + os.makedirs(os.path.dirname(dump_file), exist_ok=True) + torch.cuda.memory._dump_snapshot(dump_file) + # print(f"{torch.cuda.memory._snapshot()}") + torch.cuda.memory._record_memory_history(enabled=None) + free_bytes, total = torch.cuda.mem_get_info() + used_bytes_after = total - free_bytes + print( + f"[mem_usage] before {used_bytes_before / GiB_bytes} after {used_bytes_after / GiB_bytes}, dump to file {dump_file}" + ) diff --git a/tests/utils/test_action_parser.py b/tests/utils/test_action_parser.py new file mode 100644 index 000000000..dbed14136 --- /dev/null +++ b/tests/utils/test_action_parser.py @@ -0,0 +1,24 @@ +import re + +import pytest + +from roll.pipeline.agentic.tools.action_parser import Qwen3CoderActionParser + + +def test_qwen3coder_action_parser_parse_action_single_call(): + tool = Qwen3CoderActionParser() + response = ( + "Let me check the current directory." + "." + ) + + ok, actions = tool.parse_action(response=response) + + assert ok is True + assert isinstance(actions, list) + assert len(actions) == 1 + + action = actions[0] + assert action["type"] == "function" + assert action["function"]["name"] == "list_directory" + assert action["function"]["arguments"] == '{"path": "."}' diff --git a/tests/utils/test_cp_rmpad_ulysses_utils.py b/tests/utils/test_cp_rmpad_ulysses_utils.py new file mode 100644 index 000000000..dada4b234 --- /dev/null +++ b/tests/utils/test_cp_rmpad_ulysses_utils.py @@ -0,0 +1,48 @@ +import pytest +import torch + + +def test_ulysses_pad_and_slice_inputs_divisible(): + from roll.utils.context_parallel.rmpad_ulysses import ulysses_pad_and_slice_inputs + + input_ids = torch.arange(10, dtype=torch.long).unsqueeze(0) # [1, 10] + pos = torch.arange(10, dtype=torch.long).unsqueeze(0) # [1, 10] + + # cp_size=2 => no padding needed + x0, p0, pad0 = ulysses_pad_and_slice_inputs(input_ids, pos, cp_size=2, cp_rank=0) + x1, p1, pad1 = ulysses_pad_and_slice_inputs(input_ids, pos, cp_size=2, cp_rank=1) + + assert pad0 == 0 and pad1 == 0 + assert x0.shape == (1, 5) and x1.shape == (1, 5) + assert torch.equal(torch.cat([x0, x1], dim=1), input_ids) + assert torch.equal(torch.cat([p0, p1], dim=1), pos) + + +def test_ulysses_pad_and_slice_inputs_with_padding(): + from roll.utils.context_parallel.rmpad_ulysses import ulysses_pad_and_slice_inputs + + input_ids = torch.arange(11, dtype=torch.long).unsqueeze(0) # [1, 11] + pos = torch.arange(11, dtype=torch.long).unsqueeze(0) # [1, 11] + + # cp_size=4 => pad to 12 + parts = [] + pads = [] + for r in range(4): + x, p, pad = ulysses_pad_and_slice_inputs(input_ids, pos, cp_size=4, cp_rank=r) + parts.append(x) + pads.append(pad) + assert x.shape == (1, 3) + assert p is not None and p.shape == (1, 3) + + assert all(p == 1 
for p in pads) + full = torch.cat(parts, dim=1) + assert full.shape == (1, 12) + assert torch.equal(full[:, :11], input_ids) + + +def test_gather_outputs_and_unpad_no_group_is_noop(): + from roll.utils.context_parallel.rmpad_ulysses import gather_outputs_and_unpad + + x = torch.randn(1, 8, 3) + y = gather_outputs_and_unpad(x, gather_dim=1, unpad_dim=1, padding_size=2, group=None) + assert torch.equal(y, x[:, :6]) diff --git a/tests/utils/test_dynamic_batching.py b/tests/utils/test_dynamic_batching.py index bd4946570..d89cec60c 100644 --- a/tests/utils/test_dynamic_batching.py +++ b/tests/utils/test_dynamic_batching.py @@ -1,11 +1,9 @@ -import sys -sys.path.insert(0, "/home/wenxuan.jwx/ScaleAligner") - import torch from roll.distributed.scheduler.protocol import DataProto from roll.utils.dynamic_batching import * + def test_dynamic_batching(): dp_size = 2 num_seq = 6 @@ -40,5 +38,33 @@ def test_dynamic_batching(): assert tuple(micro_batch0.batch["input_ids"].shape) == (2,4) +def test_dynamic_batching_with_vpp(): + torch.manual_seed(42) + dp_size = 4 + num_seq = 256 + max_seq_len = 8192 + seqs_len = torch.randint(low=128, high=8192, size=(256,)).tolist() + input_ids = torch.arange(num_seq).unsqueeze(1).expand(num_seq, max_seq_len) + attention_mask = (torch.arange(max_seq_len) < torch.tensor(seqs_len)[:, None]).int() + data = DataProto.from_dict( + tensors={ + "input_ids": input_ids, + "attention_mask": attention_mask, + } + ) + max_tokens_per_microbatch = 8192 + sequence_length_round = 128 + + # test dynamic_batching_shard + pipeline_model_parallel_size = 4 + virtual_pipeline_model_size = 2 + data, _ = dynamic_batching_shard(data, dp_size, max_tokens_per_microbatch, sequence_length_round, + pipeline_model_parallel_size=pipeline_model_parallel_size, + virtual_pipeline_model_parallel_size=virtual_pipeline_model_size) + assert data.meta_info["global_micro_batch_indices"].__len__() % pipeline_model_parallel_size == 0 + assert data.meta_info["global_micro_batch_lengths"].__len__() == data.meta_info["global_micro_batch_indices"].__len__() + + if __name__ == "__main__": - test_dynamic_batching() \ No newline at end of file + # test_dynamic_batching() + test_dynamic_batching_with_vpp() diff --git a/tests/utils/test_sequence_packing.py b/tests/utils/test_sequence_packing.py new file mode 100644 index 000000000..6fadcb40b --- /dev/null +++ b/tests/utils/test_sequence_packing.py @@ -0,0 +1,247 @@ +import torch +import numpy as np +from dataclasses import dataclass +from typing import Dict +from tensordict import TensorDict +from roll.distributed.scheduler.protocol import DataProto + + +def test_load_balance_packer(): + """测试 LoadBalancePacker 并展示哪些样本被打包到一起""" + + # 导入必要的类 + from roll.utils.sequence_packing import LoadBalancePacker, SequencePackingConfig + + # 创建配置 + config = SequencePackingConfig( + algorithm="load_balance", + max_packed_sequence_length=4096 + ) + + # 创建 packer + packer = LoadBalancePacker(config) + + # 创建测试数据 - 10个样本,不同的序列长度 + batch_size = 10 + max_seq_len = 2048 + + # 创建不同长度的序列 + sequence_lengths = [512, 1024, 256, 2048, 128, 768, 1536, 384, 896, 640] + print(f"\n{'=' * 80}") + print(f"原始数据:") + print(f"{'=' * 80}") + print(f"总样本数: {batch_size}") + print(f"最大序列长度配置: {config.max_packed_sequence_length}") + print(f"\n各样本的序列长度:") + for idx, length in enumerate(sequence_lengths): + print(f" 样本 {idx}: {length} tokens") + + # 创建 attention_mask 来模拟真实的序列长度 + attention_masks = [] + input_ids_list = [] + + for seq_len in sequence_lengths: + # 创建 attention_mask: 前 seq_len 个位置为 1,其余为 0 + mask = 
torch.zeros(max_seq_len, dtype=torch.long) + mask[:seq_len] = 1 + attention_masks.append(mask) + + # 创建假的 input_ids + input_ids = torch.randint(0, 1000, (max_seq_len,), dtype=torch.long) + input_ids_list.append(input_ids) + + # 堆叠成批次 + attention_mask = torch.stack(attention_masks) + input_ids = torch.stack(input_ids_list) + + # 创建 TensorDict + batch_dict = TensorDict( + source={ + "input_ids": input_ids, + "attention_mask": attention_mask, + }, + batch_size=(batch_size,) + ) + + # 创建 DataProto + mini_batch = DataProto( + batch=batch_dict, + non_tensor_batch={}, + meta_info={} + ) + + # 设置参数 + tp_size = 1 + cp_size = 1 + vp_size = 1 + + # 创建一个假的 dp_group(对于测试,我们可以传 None) + class FakeDPGroup: + pass + + dp_group = FakeDPGroup() + + # 调用 packer + print(f"\n{'=' * 80}") + print(f"开始打包...") + print(f"{'=' * 80}") + + micro_batches = list(packer.make_micro_batch_iter_for_sequence_packing( + mini_batch=mini_batch, + tp_size=tp_size, + cp_size=cp_size, + vp_size=vp_size, + dp_group=dp_group, + micro_batch_size=None # LoadBalancePacker 会自动计算 + )) + + # 展示结果 + print(f"\n{'=' * 80}") + print(f"打包结果:") + print(f"{'=' * 80}") + print(f"总共生成了 {len(micro_batches)} 个 micro batches\n") + + print(f"partition_indices_list: {micro_batches[0].meta_info['partition_indices_list']}") + + total_workload = 0 + for micro_idx, micro_batch in enumerate(micro_batches): + partition_indices = micro_batch.meta_info.get('partition_indices', []) + + # 获取这个 micro batch 中的序列长度 + batch_seq_lens = [] + for idx in partition_indices: + seq_len = sequence_lengths[idx] + batch_seq_lens.append(seq_len) + + # 计算总长度和工作负载 + total_seq_len = sum(batch_seq_lens) + workload = sum(packer.calculate_workload(seq_len) for seq_len in batch_seq_lens) + total_workload += workload + + print(f"Micro Batch {micro_idx}:") + print(f" 包含样本: {partition_indices}") + print(f" 样本数量: {len(partition_indices)}") + print(f" 各样本长度: {batch_seq_lens}") + print(f" 总序列长度: {total_seq_len} tokens") + print(f" 工作负载: {workload:,.0f}") + print(f" 平均长度: {total_seq_len / len(partition_indices):.1f} tokens") + print() + + # 计算负载均衡统计 + workloads = [] + seq_lengths = [] + for micro_batch in micro_batches: + partition_indices = micro_batch.meta_info.get('partition_indices', []) + batch_seq_lens = [sequence_lengths[idx] for idx in partition_indices] + workload = sum(packer.calculate_workload(seq_len) for seq_len in batch_seq_lens) + workloads.append(workload) + seq_lengths.append(sum(batch_seq_lens)) + + print(f"{'=' * 80}") + print(f"负载均衡统计:") + print(f"{'=' * 80}") + print(f"工作负载分布:") + print(f" 最大: {max(workloads):,.0f}") + print(f" 最小: {min(workloads):,.0f}") + print(f" 平均: {np.mean(workloads):,.0f}") + print(f" 标准差: {np.std(workloads):,.0f}") + print(f" 不平衡度: {(max(workloads) - min(workloads)) / np.mean(workloads) * 100:.2f}%") + print() + print(f"序列长度分布:") + print(f" 最大: {max(seq_lengths)} tokens") + print(f" 最小: {min(seq_lengths)} tokens") + print(f" 平均: {np.mean(seq_lengths):.1f} tokens") + print(f" 标准差: {np.std(seq_lengths):.1f} tokens") + + # 可视化(简单的文本条形图) + print(f"\n{'=' * 80}") + print(f"工作负载可视化:") + print(f"{'=' * 80}") + max_workload = max(workloads) + bar_width = 50 + for i, workload in enumerate(workloads): + bar_len = int((workload / max_workload) * bar_width) + bar = '█' * bar_len + print(f"Batch {i}: {bar} {workload:,.0f}") + + print(f"\n{'=' * 80}") + + # ============ 测试 restore_results_order ============ + print(f"\n{'=' * 80}") + print(f"测试 restore_results_order:") + print(f"{'=' * 80}") + + # 1. 
模拟计算结果(已经按照打乱的顺序 concat 在一起) + # 计算总样本数 + total_samples = sum(len(mb.meta_info['partition_indices']) for mb in micro_batches) + + # 创建模拟的计算结果(按照打乱的顺序) + shuffled_results = { + 'logits': torch.arange(total_samples).float().unsqueeze(1), # [total_samples, 1] + 'loss': torch.arange(total_samples).float() * 10, # [total_samples] + } + + print(f"模拟计算结果(打乱顺序):") + print(f" logits shape: {shuffled_results['logits'].shape}") + print(f" loss shape: {shuffled_results['loss'].shape}") + print(f" logits 前5个值: {shuffled_results['logits'][:5].squeeze().tolist()}") + print(f" loss 前5个值: {shuffled_results['loss'][:5].tolist()}") + + # 2. 获取 partition_indices_list + partition_indices_list = mini_batch.meta_info['partition_indices_list'] + print(f"\npartition_indices_list: {partition_indices_list}") + + # 3. 还原顺序 + restored_results = LoadBalancePacker.restore_results_order( + shuffled_results, + partition_indices_list + ) + + print(f"\n还原后的结果(原始顺序):") + print(f" logits shape: {restored_results['logits'].shape}") + print(f" loss shape: {restored_results['loss'].shape}") + print(f" logits 前5个值: {restored_results['logits'][:5].squeeze().tolist()}") + print(f" loss 前5个值: {restored_results['loss'][:5].tolist()}") + + # 4. 验证还原是否正确 + # 由于我们的模拟数据是 [0, 1, 2, 3, ...] 按打乱顺序排列 + # 还原后应该对应原始索引的顺序 + print(f"\n验证还原正确性:") + + # 构建期望的结果(按原始顺序) + current_idx = 0 + expected_order = [] + for partition in partition_indices_list: + for _ in partition: + expected_order.append(current_idx) + current_idx += 1 + + # 将期望顺序映射回原始索引 + original_order = [0] * total_samples + current_idx = 0 + for partition in partition_indices_list: + for orig_idx in partition: + original_order[orig_idx] = expected_order[current_idx] + current_idx += 1 + + print(f" 期望的 logits 值(前10个): {original_order}") + print(f" 实际的 logits 值(前10个): {restored_results['logits'][:10].squeeze().tolist()}") + + # 检查是否完全匹配 + is_correct = torch.allclose( + restored_results['logits'].squeeze(), + torch.tensor(original_order, dtype=torch.float) + ) + print(f" 还原结果{'✓ 正确' if is_correct else '✗ 错误'}") + + print(f"\n{'=' * 80}\n") + + + + +if __name__ == "__main__": + # 设置随机种子以便复现 + torch.manual_seed(42) + np.random.seed(42) + + test_load_balance_packer() diff --git a/tests/utils/test_taskgroups.py b/tests/utils/test_taskgroups.py new file mode 100644 index 000000000..78513c1c4 --- /dev/null +++ b/tests/utils/test_taskgroups.py @@ -0,0 +1,115 @@ +import asyncio +import pytest + +from roll.utils.taskgroups import TaskGroup + +async def test_base(): + async def foo(result, index): + result[index] = 2333 + + result = [None] * 4 + async with TaskGroup() as tg: + for i in range(4): + tg.create_task(foo(result, i)) + assert result == [2333, 2333, 2333, 2333] + +async def test_cancel_parent(): + async def foo(result, index): + result[index] = 2333 + + async def tg_task(expected): + result = [None] * 4 + try: + async with TaskGroup() as tg: + for i in range(4): + await asyncio.sleep(2) + tg.create_task(foo(result, i)) + except asyncio.CancelledError: + assert result == expected + raise + + task = asyncio.create_task(tg_task(expected=[None, None, None, None])) + await asyncio.sleep(1) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + + task = asyncio.create_task(tg_task([2333, None, None, None])) + await asyncio.sleep(3) + task.cancel() + with pytest.raises(asyncio.CancelledError): + await task + +async def test_parent_exception(): + async def foo(result, index, sleep_time=0): + await asyncio.sleep(sleep_time) + result[index] = 2333 + + async def tg_task(): + 
+        result = [None] * 4
+        try:
+            async with TaskGroup() as tg:
+                tg.create_task(foo(result, 0, sleep_time=0))
+                tg.create_task(foo(result, 1, sleep_time=0))
+                tg.create_task(foo(result, 2, sleep_time=2))
+                tg.create_task(foo(result, 3, sleep_time=2))
+                await asyncio.sleep(1)
+                raise RuntimeError
+        except RuntimeError:
+            assert result == [2333, 2333, None, None]
+            raise
+
+    with pytest.raises(RuntimeError):
+        await asyncio.create_task(tg_task())
+
+async def test_tg_exception():
+    async def foo(result, index, sleep_time=0, raise_exception=False):
+        await asyncio.sleep(sleep_time)
+        if raise_exception:
+            raise RuntimeError
+        result[index] = 2333
+
+    async def tg_task():
+        result = [None] * 4
+        try:
+            async with TaskGroup() as tg:
+                tg.create_task(foo(result, 0, sleep_time=0, raise_exception=False))
+                tg.create_task(foo(result, 1, sleep_time=0, raise_exception=False))
+                tg.create_task(foo(result, 2, sleep_time=0, raise_exception=True))
+                tg.create_task(foo(result, 3, sleep_time=2, raise_exception=False))
+                # dead loop to test whether TaskGroup can propagate the exception
+                while True:
+                    await asyncio.sleep(1)
+        except RuntimeError:
+            assert result == [2333, 2333, None, None]
+            raise
+
+    with pytest.raises(RuntimeError):
+        await asyncio.create_task(tg_task())
+
+async def test_cancel_tg():
+    async def foo(result, index, cancel=False):
+        await asyncio.sleep(1)
+        result[index] = 2333
+
+    async def tg_task():
+        result = [None] * 4
+        async with TaskGroup() as tg:
+            tg.create_task(foo(result, 0))
+            tg.create_task(foo(result, 1))
+            task = tg.create_task(foo(result, 2))
+            task.cancel()
+            tg.create_task(foo(result, 3))
+        assert result == [2333, 2333, None, 2333]
+
+    await asyncio.create_task(tg_task())
+
+async def main():
+    await asyncio.create_task(test_base())
+    await asyncio.create_task(test_cancel_parent())
+    await asyncio.create_task(test_parent_exception())
+    await asyncio.create_task(test_tg_exception())
+    await asyncio.create_task(test_cancel_tg())
+
+if __name__ == "__main__":
+    asyncio.run(main())
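# A minimal usage sketch of the TaskGroup contract that tests/utils/test_taskgroups.py
# pins down: every task created inside the `async with` block is awaited before the block
# exits, and an exception raised by a child (or the parent) cancels the unfinished
# siblings. The `demo` coroutine below is illustrative only.
import asyncio
from roll.utils.taskgroups import TaskGroup

async def demo():
    results = [None] * 3

    async def worker(i):
        await asyncio.sleep(0)
        results[i] = i * i

    async with TaskGroup() as tg:
        for i in range(3):
            tg.create_task(worker(i))
    # reaching this line means all three tasks have completed
    assert results == [0, 1, 4]

if __name__ == "__main__":
    asyncio.run(demo())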